/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

     <?xml version="1.0" encoding="UTF-8"?>
     <classpath>
     	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
     	<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
     	<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins">
     		<accessrules>
     			<accessrule kind="accessible" pattern="**"/>
     		</accessrules>
     	</classpathentry>
     	<classpathentry kind="src" path="src"/>
     	<classpathentry kind="output" path="bin"/>
     </classpath>

     Manifest-Version: 1.0
     Require-Bundle: org.txm.utils;bundle-version="1.0.0";visibility:=reexp
      ort,org.eclipse.core.runtime;bundle-version="3.10.0";visibility:=reex
      port,org.eclipse.osgi;bundle-version="3.10.2";visibility:=reexport,or
      g.eclipse.ui;visibility:=reexport,org.txm.concordance.rcp;visibility:
      =reexport,org.txm.searchengine.cqp.core;bundle-version="1.1.0";visibi
      lity:=reexport,org.txm.rcp;bundle-version="0.7.8";visibility:=reexpor
      t,org.txm.core;bundle-version="0.7.0";visibility:=reexport,org.txm.pa
      ra.core;visibility:=reexport,org.txm.searchengine.core;bundle-version
      ="1.0.0";visibility:=reexport
     Require-Bundle: org.txm.concordance.rcp;visibility:=reexport,
      org.txm.para.core;visibility:=reexport
     Export-Package: org.txm.para.rcp.editors,
      org.txm.para.rcp.handlers,
      org.txm.para.rcp.messages

        </license>
        <requires>
           <import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/>
           <import plugin="org.txm.searchengine.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.rcp" version="0.7.7" match="greaterOrEqual"/>
           <import plugin="org.eclipse.core.runtime" version="3.10.0" match="greaterOrEqual"/>
           <import plugin="org.eclipse.ui" version="3.106.1" match="greaterOrEqual"/>
           <import plugin="org.eclipse.jface.text"/>
           <import plugin="org.eclipse.ui.editors"/>
           <import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/>
           <import plugin="org.txm.chartsengine.r.core"/>
           <import plugin="org.eclipse.core.expressions" version="3.4.600" match="greaterOrEqual"/>
           <import plugin="org.txm.textsbalance.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.chartsengine.jfreechart.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.chartsengine.rcp" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/>
           <import plugin="org.txm.chartsengine.jfreechart.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.chartsengine.r.core" version="1.0.0" match="greaterOrEqual"/>
        </requires>
        <plugin

     <?xml version="1.0" encoding="UTF-8"?>
     <classpath>
     	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
     	<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins"/>
     	<classpathentry kind="con" path="org.eclipse.pde.core.requiredPlugins">
     		<accessrules>
     			<accessrule kind="accessible" pattern="**"/>
     		</accessrules>
     	</classpathentry>
     	<classpathentry kind="src" path="src"/>
     	<classpathentry kind="output" path="bin"/>
     </classpath>

     Manifest-Version: 1.0
     Require-Bundle: org.txm.libs.groovy-all;bundle-version="2.3.3";visibil
      ity:=reexport,org.txm.utils;bundle-version="1.0.0";visibility:=reexpo
      rt,org.eclipse.osgi;bundle-version="3.10.2";visibility:=reexport,org.
      eclipse.core.runtime;bundle-version="3.10.0";visibility:=reexport,org
      .txm.searchengine.cqp.core;bundle-version="1.1.0";visibility:=reexpor
      t,org.txm.core;bundle-version="0.7.0";visibility:=reexport,org.eclips
      e.ui;visibility:=reexport,org.txm.searchengine.core;bundle-version="1
      .0.0";visibility:=reexport
     Require-Bundle: org.txm.searchengine.cqp.core;bundle-version="1.1.0";visibility:=reexport
     Export-Package: org.txm.para.core.functions
     Bundle-ActivationPolicy: lazy
     Bundle-Version: 1.0.0.qualifier

        </license>
        <requires>
           <import plugin="org.txm.utils"/>
           <import plugin="org.eclipse.core.runtime" version="3.10.0" match="greaterOrEqual"/>
           <import plugin="org.txm.core" version="0.8.0" match="greaterOrEqual"/>
           <import plugin="org.txm.treetagger.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.eclipse.ui"/>
           <import plugin="org.eclipse.swt"/>
           <import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/>
           <import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/>
           <import plugin="org.txm.libs.groovy-all" version="2.3.3" match="greaterOrEqual"/>
           <import plugin="org.txm.utils" version="1.0.0" match="greaterOrEqual"/>
        </requires>
        <plugin

     import org.apache.commons.lang.StringUtils;
     import org.eclipse.core.runtime.IProgressMonitor;
     import org.txm.Toolbox;
     import org.txm.importer.graal.PersonalNamespaceContext;
     import org.txm.importer.PersonalNamespaceContext;
     import org.txm.searchengine.cqp.AbstractCqiClient;
     import org.txm.searchengine.cqp.CQPSearchEngine;
     import org.txm.searchengine.cqp.corpus.MainCorpus;

     import org.eclipse.core.runtime.IProgressMonitor;
     import org.txm.Toolbox;
     import org.txm.importer.graal.PersonalNamespaceContext;
     import org.txm.importer.PersonalNamespaceContext;
     import org.txm.searchengine.cqp.AbstractCqiClient;
     import org.txm.searchengine.cqp.CQPSearchEngine;
     import org.txm.searchengine.cqp.corpus.MainCorpus;

     import org.txm.Toolbox;
     import org.txm.importer.StaxIdentityParser;
     import org.txm.importer.StaxStackWriter;
     import org.txm.importer.graal.PersonalNamespaceContext;
     import org.txm.importer.PersonalNamespaceContext;
     import org.txm.objects.BaseParameters;
     import org.txm.rcp.Application;
     import org.txm.rcp.TxmPreferences;

        </license>
        <requires>
           <import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/>
           <import plugin="org.txm.utils" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.eclipse.core.runtime" version="3.10.0" match="greaterOrEqual"/>
           <import plugin="org.txm.chartsengine.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/>
           <import plugin="org.txm.statsengine.r.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.chartsengine.jfreechart.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.statsengine.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.chartsengine.r.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.index.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/>
           <import plugin="org.eclipse.osgi" version="3.10.2" match="greaterOrEqual"/>
           <import plugin="org.txm.index.rcp" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.eclipse.ui" version="3.106.1" match="greaterOrEqual"/>
           <import plugin="org.txm.wordcloud.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.chartsengine.rcp"/>
        </requires>

        </license>
        <requires>
           <import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/>
           <import plugin="org.txm.libs.groovy-all" version="2.3.3" match="greaterOrEqual"/>
           <import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/>
           <import plugin="org.eclipse.ui"/>
           <import plugin="org.eclipse.core.runtime"/>
           <import plugin="org.txm.utils"/>
           <import plugin="org.eclipse.persistence.jpa" version="2.6.0" match="greaterOrEqual"/>
           <import plugin="org.txm.index.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.index.rcp" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="javax.persistence"/>
           <import plugin="org.txm.annotation.kr.core" version="1.0.0" match="greaterOrEqual"/>
        </requires>

     Manifest-Version: 1.0
     Require-Bundle: org.txm.rcp;bundle-version="0.7.8";visibility:=reexpor
      t,org.txm.utils;bundle-version="1.0.0";visibility:=reexport,org.eclip
      se.osgi;bundle-version="3.10.2";visibility:=reexport,org.txm.index.rc
      p;bundle-version="1.0.0";visibility:=reexport,org.eclipse.core.runtim
      e;bundle-version="3.10.0";visibility:=reexport,org.txm.chartsengine.c
      ore;bundle-version="1.0.0";visibility:=reexport,org.txm.index.core;bu
      ndle-version="1.0.0";visibility:=reexport,org.txm.searchengine.cqp.co
      re;bundle-version="1.1.0";visibility:=reexport,org.eclipse.ui;bundle-
      version="3.106.1";visibility:=reexport,org.txm.wordcloud.core;bundle-
      version="1.0.0";visibility:=reexport,org.txm.chartsengine.rcp;visibil
      ity:=reexport,org.txm.core;bundle-version="0.7.0";visibility:=reexpor
+     t
     Require-Bundle: org.txm.index.rcp;bundle-version="1.0.0";visibility:=reexport,
      org.txm.wordcloud.core;bundle-version="1.0.0";visibility:=reexport,
      org.txm.chartsengine.rcp;visibility:=reexport
     Bundle-Vendor: Textometrie.org
     Bundle-ActivationPolicy: lazy
     Bundle-Version: 1.0.0.qualifier

        </license>
        <requires>
           <import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/>
           <import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/>
           <import plugin="org.eclipse.ui"/>
           <import plugin="org.eclipse.core.runtime"/>
           <import plugin="org.eclipse.ui.editors" version="3.8.200" match="greaterOrEqual"/>
           <import plugin="org.eclipse.swt"/>
           <import plugin="org.txm.searchengine.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.searchengine.cqp.core" version="1.1.0" match="greaterOrEqual"/>
           <import plugin="org.txm.statsengine.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.statsengine.r.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.statsengine.r.rcp" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.index.core" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.index.rcp" version="1.0.0" match="greaterOrEqual"/>
           <import plugin="org.txm.utils"/>
           <import plugin="org.txm.groovy.core" version="1.0.0" match="greaterOrEqual"/>
        </requires>
        <plugin

        </license>
        <requires>
           <import plugin="org.txm.core" version="0.7.0" match="greaterOrEqual"/>
           <import plugin="org.txm.rcp" version="0.7.8" match="greaterOrEqual"/>
           <import plugin="org.eclipse.ui"/>
           <import plugin="org.eclipse.core.runtime"/>
           <import plugin="org.eclipse.ui.editors" version="3.8.100" match="greaterOrEqual"/>
           <import plugin="org.eclipse.core.expressions" version="3.4.500" match="greaterOrEqual"/>
           <import plugin="org.txm.index.core"/>
           <import plugin="org.txm.lexicaltable.core"/>
           <import plugin="org.txm.statsengine.r.core"/>
        </requires>
        <plugin

tmp/org.txm.tigersearch.rcp/.settings/org.eclipse.jdt.groovy.core.prefs (revision 911)
1	1	eclipse.preferences.version=1
2		groovy.compiler.level=23
	2	groovy.compiler.level=-1
	3	groovy.script.filters=*/.dsld,y,*/.gradle,n

      ls.debugger,org.mozilla.javascript.tools.idswitch,org.mozilla.javascr
      ipt.tools.jsc,org.mozilla.javascript.tools.shell,org.relaxng.datatype
      ,org.relaxng.datatype.helpers,org.txm.export.ts,org.txm.function.tige
      rsearch,org.txm.importer.srcmf,org.txm.importer.tigersearch,org.txm.s
      rsearch,org.txm.s
      earchengine.ts,org.txm.test,org.txm.tigersearch.commands,org.txm.tige
      rsearch.editors,org.w3c.css.sac,org.w3c.css.sac.helpers,org.w3c.dom.s
      mil,org.w3c.dom.svg,tigerAPI,tigerAPI.converters,tigerAPI.theories.ho

     // Script to restore punctuation etc. to results of TS query.
     // Inputs:
     // --- Tiger-XML Document node
     // --- Java array:
     // --- --- String [index][type] where:
     // --- --- --- type == 0 gives the xml:id
     // --- --- --- type == 1 gives the word form
     // Process:
     // --- Injects punctuation.
     // Returns:
     // --- Tiger-XML Document node.
     // TXM package statement
     package org.txm.importer.srcmf
     import javax.xml.parsers.DocumentBuilderFactory
     /**
      * Re-injects into a Tiger-XML document the words that are listed in an
      * ordered table of [word id, word form] pairs but have no matching
      * <t> node (typically the punctuation), then moves sentence-initial
      * punctuation to the end of the previous <terminals> element and writes
      * the resulting document to {@code outputFile}.
      *
      * @author tmr
      */
     class PunctInjectImport
     {
     	/** Namespace used to read the xml:id attribute of the XML-TXM words. */
     	static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
     	/** TEI namespace used to locate the w elements of the XML-TXM file. */
     	static nsTei = new groovy.xml.Namespace('http://www.tei-c.org/ns/1.0', 'tei')
     	/** Root element (W3C DOM) of the parsed Tiger-XML document. */
     	def tigerXml = null
     	/** Ordered list of [id, form] pairs; process() works on a copy of it. */
     	def txmIdWordTableFixed = []
     	/** File written by process(); defaults to "output.xml". */
     	def outputFile = null
     	/*
     	 * Call this constructor when no need of XML-TXM file for word ID table
     	 *
     	 * tigerXmlFile:   Tiger-XML file to parse
     	 * txmIdWordTable: ordered list of [id, form] pairs
     	 */
     	PunctInjectImport(File tigerXmlFile, ArrayList txmIdWordTable) {
     		def builder = DocumentBuilderFactory.newInstance().newDocumentBuilder()
     		tigerXml = builder.parse(tigerXmlFile).documentElement
     		txmIdWordTableFixed = txmIdWordTable
     		outputFile = new File("output.xml")
     	}
     	/*
     	 * Call this constructor when the XML-TXM file is needed for the word list.
     	 * The table is built from every tei:w element found under the text
     	 * element: its id (@id, else xml:id) and the text of its first txm:form.
     	 */
     	PunctInjectImport(File tigerXmlFile, File xmlTxmFile) {
     		def builder = DocumentBuilderFactory.newInstance().newDocumentBuilder()
     		tigerXml = builder.parse(tigerXmlFile).documentElement
     		txmIdWordTableFixed = []
     		def xmlTxmGroovyDOM = new XmlParser().parse(xmlTxmFile)
     		for (def w : xmlTxmGroovyDOM.text.'**'.findAll { it.name() == nsTei.'w' } ) {
     			def anId = w.'@id'
     			if (! anId) {
     				// fall back to the namespaced xml:id attribute
     				anId = w.attribute(nsXml.id)
     			}
     			if (! anId) {
     				println "Error: ID attribute not located in XML TXM file"
     			}
     			txmIdWordTableFixed << [anId, getDeepText(w."txm:form"[0])]
     		}
     		// assert txmIdWordTableFixed.size() > 0
     		outputFile = new File("output.xml")
     	}
     	/**
     	 * Runs the two phases (word injection, then punctuation re-attachment)
     	 * on the parsed document and writes it, UTF-8 encoded, to outputFile.
     	 */
     	def process () {
     		// Work on a copy: the table is consumed from its head.
     		def txmIdWordTable = new ArrayList(this.txmIdWordTableFixed)
     		def allTs = toList(this.tigerXml.getElementsByTagName('t'))
     		def tAttrs = getTAttrNames(allTs[0])
     		def tNodeIdPrefix = allTs[0].getAttribute('id').tokenize('#').first()
     		def addedWordIds = []
     		// Index of the next <t> node (in allTs) to compare with the table.
     		int tIx = 0
     		// First phase: walk the word table and the <t> nodes in parallel.
     		while (txmIdWordTable) {
     			def word = txmIdWordTable.remove(0)
     			if (tIx == allTs.size()) {
     				// End of TS file, but still words left in the BFM file.
     				addTNode(word, allTs.last(), tAttrs, tNodeIdPrefix, 'append')
     			}
     			else {
     				def tNode = allTs[tIx]
     				def tId = getTNodeId(tNode)
     				if (tId == word[0] && tNode.getAttribute('word') == word[1]) {
     					// alles gut
     					tIx += 1
     				}
     				else if (tId == word[0]) {
     					println("Mismatched Ids! ($tId)")
     					tIx += 1
     				}
     				else if (['#', '*'].contains(tNode.getAttribute('word')) ) {
     					// SRCMF duplicata; try comparing word against the next tNode next time
     					// around.
     					txmIdWordTable.add(0, word)
     					tIx += 1
     				}
     				// Check that the SRCMF corpus doesn't have a bug in it...
     				// take(tIx) is the list of nodes strictly before the current one; it is
     				// empty when tIx == 0 (a 0..tIx-1 range would become 0..-1, i.e. the
     				// whole list, and the current node would be reported as its own duplicate).
     				else if ( !(word[1] =~ /[\,\.\?\!\:\;\(\)\[\]\{\}]/) &&
     				(allTs.take(tIx).find{
     					it.getAttribute('id') == "$tNodeIdPrefix#$tId"
     				})) {
     					println "Warning: word ${tNode.getAttribute('word')}, id $tId appears twice in corpus!"
     					txmIdWordTable.add(0, word)
     					tIx += 1
     				}
     				// Check that there's not an extra word in the SRCMF corpus (rare, usu. a tokenisation change)
     				else if ( !(word[1] =~ /[\,\.\?\!\:\;\(\)\[\]\{\}]/) &&
     				(allTs[tIx..-1].find{
     					it.getAttribute('id') == "$tNodeIdPrefix#${word[0]}"
     				})) {
     					println "Warning: word ${tNode.getAttribute('word')}, id $tId does not appear in BFM!"
     					txmIdWordTable.add(0, word)
     					tIx += 1
     				}
     				else if (addedWordIds.contains(tId)) {
     					println "Warning: word ${tNode.getAttribute('word')}, id ${tId} out of sequence in SRCMF corpus!"
     					txmIdWordTable.add(0, word)
     					tIx += 1
     				}
     				else {
     					// Insert word.  In the first instance, it will have the same parent as
     					// the tNode before which it's being inserted.
     					addTNode(word, allTs[tIx], tAttrs, tNodeIdPrefix, 'before')
     					addedWordIds.add(word[0])
     				}
     			}
     		}
     		// Second phase: move punctuation into previous sentence,
     		// dependent on sequence.
     		def allTerminalses = toList(this.tigerXml.getElementsByTagName('terminals'))
     		for (def i = 1 ; i < allTerminalses.size() ; i++) {
     			def ts = toList(allTerminalses[i].getElementsByTagName('t'))
     			def startPunc = true
     			def puncStack = []
     			// Collect the leading nodes whose form contains no letter or digit.
     			// The class is spelled A-Za-z on purpose: A-z would also match [ \ ] ^ _ `
     			while (ts && startPunc) {
     				if ((ts[0].getAttribute('word') =~ /[A-Za-zÀ-ÿ0-9]/).size() == 0) {
     					puncStack.add(ts.remove(0))
     				}
     				else {
     					startPunc = false
     				}
     			}
     			// Now, treat the punctuation stack at the beginning of the sentence
     			if ( puncStack ) {
     				int moveLeft = 0
     				// One character per stacked node, so that a position in puncString is
     				// also an index in the stack; an empty form is represented by a space.
     				def puncString = puncStack.collect{
     					def form = it.getAttribute('word')
     					form ? form[0] : ' '
     				}.join('')
     				// First, identify LAST instance of sentence-final punctuation.
     				def matches = puncString =~ /[\.\,\;\:\!\?\)\]\}»”’]/
     				if (matches.size() > 0) {
     					moveLeft = puncString.lastIndexOf(matches[-1]) + 1
     				}
     				// Second, split pairs of straight quotes
     				matches = puncString =~ /(""|'')/ //"
     				if (matches.size() > 0) {
     					moveLeft = [moveLeft, puncString.lastIndexOf(matches[-1][0]) + 1].max()
     				}
     				// Now, move moveLeft punctuation nodes to the end of the prev. sentence
     				ts = toList(allTerminalses[i].getElementsByTagName('t'))
     				for (def j = 0 ; j < moveLeft ; j++ ) {
     					allTerminalses[i - 1].appendChild(ts[j])
     				}
     			}
     		}
     		outputFile.withWriter("UTF-8") { writer ->
     			writer.println(this.tigerXml)
     		}
     	}
     	/**
     	 * Creates a <t> node for the given [id, form] pair and attaches it next
     	 * to tNode. Every attribute name of tAttrs is set: 'id' and 'word' from
     	 * the pair, all the others to '--'.
     	 *
     	 * where: 'before' inserts the node before tNode, 'append' adds it as
     	 *        the last child of tNode's parent; anything else throws
     	 *        IllegalArgumentException.
     	 */
     	private addTNode(word, tNode, tAttrs, tNodeIdPrefix, where) {
     		def newTNode = tNode.getOwnerDocument().createElement('t')
     		for (def anAttr : tAttrs) {
     			if (anAttr == 'id') {
     				newTNode.setAttribute('id', "${tNodeIdPrefix}#${word[0]}")
     			}
     			else if (anAttr == 'word') {
     				newTNode.setAttribute('word', word[1])
     			}
     			else {
     				newTNode.setAttribute(anAttr, '--')
     			}
     		}
     		if (where == 'before') {
     			tNode.getParentNode().insertBefore(newTNode, tNode)
     		}
     		else if (where == 'append') {
     			tNode.getParentNode().appendChild(newTNode)
     		}
     		else {
     			throw new IllegalArgumentException('Bad where value: ' + where)
     		}
     	}
     	/** Returns the list of the attribute names carried by the given node. */
     	def getTAttrNames(tNode) {
     		def nodeMap = tNode.attributes
     		def nameList = []
     		for ( def i = 0 ; i < nodeMap.getLength() ; i++ ) {
     			nameList.add( nodeMap.item(i).nodeName )
     		}
     		return nameList
     	}
     	/** Returns the part of the node's 'id' attribute that follows the last '#'. */
     	def getTNodeId(tNode) {
     		return tNode.getAttribute('id').tokenize('#').last()
     	}
     	/** Copies the elements of any iterable (e.g. a DOM NodeList) into a new list. */
     	def toList(def iterable) {
     		return iterable.findAll {true};
     	}
     	/** Concatenates, depth first, all the String children found under the given groovy.util.Node. */
     	def getDeepText(def node) {
     		def str = ""
     		node.children().each { child ->
     			if (child instanceof Node) {
     				str += getDeepText(child)
     			} else if (child instanceof String) {
     				str += child
     			}
     		}
     		return str
     	}
     	/**
     	 * Command-line entry point: PunctInjectImport.groovy TigerXml.xml XmlTxm.xml
     	 * Exits with status 2 when the command line cannot be used.
     	 */
     	public static void main(String[] args)
     	{
     		def cli = new CliBuilder(
     				usage:'PunctInjectImport.groovy TigerXml.xml XmlTxm.xml'
     				)
     		cli.h(longOpt:'help', 'Prints this message.')
     		cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
     		def options = cli.parse(args)
     		if (options == null) {
     			// parse() returns null on a bad command line, after reporting it itself
     			System.exit(2)
     		}
     		if (options.h) {
     			cli.usage()
     			return
     		}
     		if (options.arguments().size() != 2) {
     			println 'Incorrect number of command line arguments... exiting'
     			// usage() prints by itself and returns nothing
     			cli.usage()
     			System.exit(2)
     		}
     		def tigerXmlFile = new File(options.arguments()[0])
     		def xmlTxmFile = new File(options.arguments()[1])
     		def pii = new PunctInjectImport(tigerXmlFile, xmlTxmFile)
     		if (options.o) {
     			pii.outputFile = new File(options.o)
     		}
     		pii.process()
     	}
     }

     package org.txm.importer.srcmf
     // srcdir
     // txml/master.xml
     // txml/*.xml
     // txm/*.xml
     // 1 import xml-txm
     import ims.tiger.gui.tigerregistry.TIGERRegistry;
     import ims.tiger.index.writer.IndexBuilderErrorHandler;
     import ims.tiger.index.writer.SimpleErrorHandler;
     import ims.tiger.index.writer.XMLIndexing;
     import ims.tiger.system.*;
     import javax.xml.stream.*;
     import java.io.BufferedWriter;
     import java.io.File;
     import java.io.FileOutputStream;
     import java.io.IOException;
     import java.io.OutputStreamWriter;
     import java.io.PrintStream;
     import java.net.URL;
     import java.util.HashMap;
     import java.util.logging.FileHandler
     import java.io.File;
     import org.apache.log4j.BasicConfigurator;
     import org.txm.export.ts.*;
     import org.txm.importer.ApplyXsl2;
     import org.txm.utils.xml.DomUtils;
     import org.txm.importer.ValidateXml;
     import org.txm.importer.XPathResult;
     import org.txm.importer.xmltxm.compiler;
     import org.txm.importer.xml.pager;
     import org.txm.objects.*;
     import org.txm.utils.*;
     import org.txm.*;
     import org.txm.scripts.teitxm.*;
     import org.txm.metadatas.*;
     import org.txm.utils.i18n.*;
     import org.w3c.dom.Element
     import org.xml.sax.SAXException;
     import javax.xml.parsers.DocumentBuilderFactory
     // SRCMF import script. Steps, as performed below:
     //  1. read the import parameters (from the binding, or from a hard-coded
     //     file in "DEV MODE" when the binding variables are missing);
     //  2. copy the valid XML-TXM files of <rootDir>/xml-txm into the binary
     //     corpus directory and run the XML-TXM compiler on them;
     //  3. build the default HTML edition and save import.xml;
     //  4. run the SRCMF importer, re-inject the punctuation and build the
     //     TIGERSearch index.
     String userDir = System.getProperty("user.home");
     boolean debug = false
     def MONITOR;
     BaseParameters params;
     try {
     	// Both variables are expected from the script binding.
     	params = paramsBinding
     	MONITOR = monitor
     } catch (Exception ignored) {
     	println "DEV MODE";//exception means we debug
     	debug = true
     	params = new BaseParameters(new File(userDir, "xml/slethgier/import.xml"))
     	params.load()
     	if (!org.txm.Toolbox.isInitialized()) {
     		Toolbox.setParam(Toolbox.INSTALL_DIR,new File("/usr/lib/TXM"));
     		Toolbox.setParam(Toolbox.TREETAGGER_INSTALL_PATH,new File(userDir,"treetagger"));
     		Toolbox.setParam(Toolbox.TREETAGGER_MODELS_PATH,new File(userDir,"treetagger/models"));
     		Toolbox.setParam(Toolbox.METADATA_ENCODING, "UTF-8");
     		Toolbox.setParam(Toolbox.METADATA_COLSEPARATOR, ",");
     		Toolbox.setParam(Toolbox.METADATA_TXTSEPARATOR, "\"");
     		Toolbox.setParam(Toolbox.USER_TXM_HOME, new File(System.getProperty("user.home"), "TXM"));
     	}
     }
     if (params == null) { println "no parameters. Aborting"; return; }
     String corpusname = params.getCorpusName();
     Element corpusElem = params.corpora.get(corpusname);
     String basename = params.name;
     String rootDir = params.rootDir;
     String lang = corpusElem.getAttribute("lang");
     String model = lang
     String encoding = corpusElem.getAttribute("encoding");
     boolean annotate = "true" == corpusElem.getAttribute("annotate");
     String xsl = params.getXsltElement(corpusElem).getAttribute("xsl")
     def xslParams = params.getXsltParams(corpusElem);
     File txmSrcDir = new File(rootDir, "xml-txm");
     File tigerSrcDir = new File(rootDir, "tiger-xml");
     File headerFile = new File(rootDir, "tiger-xml/header.xml");
     // The binary corpus directory is rebuilt from scratch.
     File binDir = new File(Toolbox.getTXMHOMEPATH(),"corpora/"+basename);
     binDir.deleteDir();
     binDir.mkdirs();
     if (!binDir.exists()) {
     	println "Could not create binDir "+binDir
     	return;
     }
     if (!txmSrcDir.exists()) {
     	println "Error: txm src dir does not exists: "+txmSrcDir
     	return;
     }
     if (!tigerSrcDir.exists()) {
     	println "Error: tiger src dir does not exists: "+tigerSrcDir
     	return;
     }
     File txmDir = new File(binDir,"txm/$corpusname");
     txmDir.deleteDir();
     txmDir.mkdirs();
     // copy txm files
     List<File> srcfiles = txmSrcDir.listFiles();
     for (File f : srcfiles) {// check XML format, and copy file into binDir
     	if (f.getName().equals("import.xml") || f.getName().equals("metadata.csv") || f.getName().endsWith(".properties"))
     		continue;
     	if (ValidateXml.test(f)) {
     		FileCopy.copy(f, new File(txmDir, f.getName()));
     	} else {
     		println "Won't process file "+f;
     	}
     }
     // listFiles() gives an empty array (not null) for an existing empty
     // directory, so both cases are tested here.
     List<File> filelist = txmDir.listFiles();
     if (!filelist) {
     	println "No txm file to process"
     	return;
     }
     Collections.sort(filelist);
     if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
     if (MONITOR != null) MONITOR.worked(25, "COMPILING")
     println "-- COMPILING - Building Search Engine indexes"
     println "binDir: $binDir"
     println "txmDir: $txmDir"
     def c = new compiler();
     if (debug) c.setDebug();
     c.setLang(lang);
     //c.setSortMetadata(sortMetadata)
     if (!c.run(binDir, txmDir, basename, corpusname, filelist)) {
     	println "import process stopped";
     	return;
     }
     if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
     if (MONITOR != null) MONITOR.worked(20, "EDITION")
     println "-- EDITION - Building edition"
     new File(binDir,"HTML/$corpusname").deleteDir();
     new File(binDir,"HTML/$corpusname").mkdirs();
     File outdir = new File(binDir,"/HTML/$corpusname/default/");
     outdir.mkdirs();
     // Number of file names already printed on the console.
     def second = 0
     println "Paginating text: "
     for (File srcfile : filelist) {
     	// Text name = file name without its extension.
     	String txtname = srcfile.getName();
     	int i = txtname.lastIndexOf(".");
     	if(i > 0) txtname = txtname.substring(0, i);
     	List<String> NoSpaceBefore = LangFormater.getNoSpaceBefore(lang);
     	List<String> NoSpaceAfter = LangFormater.getNoSpaceAfter(lang);
     	Element text = params.addText(params.corpora.get(corpusname), txtname, srcfile);
     	// Names are separated by ", " with a line break every 5 names. The
     	// counter is incremented unconditionally, otherwise it never leaves 0.
     	if (second > 0) {
     		print(", ")
     		if ((second % 5) == 0) println ""
     	}
     	second++
     	print(srcfile.getName());
     	def ed = new pager(srcfile, outdir, txtname, NoSpaceBefore, NoSpaceAfter, 500, basename, "pb");
     	Element edition = params.addEdition(text, "default", outdir.getAbsolutePath(), "html");
     	// Pages are numbered from 1; each one is registered with the matching
     	// entry of ed.getIdx().
     	for (i = 0 ; i < ed.getPageFiles().size();) {
     		String wordid = ed.getIdx().get(i);
     		params.addPage(edition, ""+(++i), wordid);
     	}
     }
     if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
     if (MONITOR != null) MONITOR.worked(20, "FINALIZING")
     File paramFile = new File(binDir, "import.xml");
     DomUtils.save(params.root.getOwnerDocument(), paramFile);
     readyToLoad = true;
     // TMR Modifications start here
     File tigerDir = new File(binDir, "tiger");
     tigerDir.mkdir();
     File master = new File(tigerSrcDir, "master.xml");
     // NOTE(review): listFiles() order is unspecified and the directory is not
     // filtered here as it is in the copy loop above - confirm that xml-txm
     // really holds a single file.
     File txmSrcFile = txmSrcDir.listFiles()[0]; // only one text file
     // Run SRCMF cmd line corpus import script
     if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
     def importer = new SrcmfImporter()
     def tigerXmlAll = importer.doAllButPnc(master, txmSrcFile, headerFile, binDir)
     // Create TigerXml-POS-PNC
     if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
     println "create TIGER XML POS PNC"
     PunctInjectImport injector;
     if (Toolbox.isInitialized()) {
     	// get idx from CQP
     	// INCOMPLETE
     	// def injector = new PunctInjectImport(tmpFile, words)
     	println "Error: the loader is not ready to be launched from RCP"
     	return;
     } else {
     	injector = new PunctInjectImport(tigerXmlAll, txmSrcFile)
     }
     injector.outputFile = new File(tigerDir, "TigerPnc.xml")
     injector.process()
     // Create the tigersearch.logprop file (used to launch TIGERSearch)
     File logprop = new File(tigerDir, "tigersearch.logprop");
     logprop.withWriter("UTF-8") { writer ->
     	writer.write("""# Default log configuration of the TIGERSearch suite
     log4j.rootLogger=WARN,Logfile
     log4j.logger.ims.tiger.gui.tigersearch.TIGERSearch=INFO
     log4j.appender.Logfile=org.apache.log4j.RollingFileAppender
     log4j.appender.Logfile.File=\${user.home}/tigersearch/tigersearch.log
     log4j.appender.Logfile.MaxFileSize=500KB
     log4j.appender.Logfile.MaxBackupIndex=1
     log4j.appender.Logfile.layout=org.apache.log4j.PatternLayout
     log4j.appender.Logfile.layout.ConversionPattern=%5r %-5p [%t] %c{2} - %m%n""")
     }
     // Run TigerRegistry
     if (MONITOR != null && MONITOR.isCanceled()) { return MONITOR.done(); }
     BasicConfigurator.configure();
     String uri = tigerXmlAll.getAbsolutePath();
     File tigerBinDir = new File(tigerDir, corpusname)
     tigerBinDir.mkdir()
     try {
     	IndexBuilderErrorHandler handler = new SimpleErrorHandler(tigerBinDir.getAbsolutePath());
     	XMLIndexing indexing = new XMLIndexing(corpusname,uri, tigerBinDir.getAbsolutePath(), handler,false);
     	indexing.startIndexing();
     }
     catch (IOException e) { System.out.println("IO: "+e.getMessage()); }
     catch (SAXException e) { System.out.println("SAX: "+e.getMessage()); }
     // The intermediate Tiger-XML file is removed once it has been indexed.
     tigerXmlAll.delete()

     /*
     Combines subcorpus files to give a single main file.
      */
     // TXM package statement
     package org.txm.importer.srcmf
     // Command-line entry point
     // Parse the command line: one positional argument (the Tiger master file)
     // and an optional -o/--output file, then merge the subcorpora.
     def cli = new CliBuilder(
         usage:'TigerSubcorpus2Main.groovy [options] tiger_master.xml'
     )
     cli.h(longOpt:'help', 'Prints this message.')
     cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
     options = cli.parse(args)
     if (options == null) {
         // parse() returns null on a bad command line, after reporting it itself
         System.exit(2)
     }
     if (options.h) {
         cli.usage()
         System.exit(0)
     }
     if (options.arguments().size() != 1) {
         println 'Incorrect number of command line arguments... exiting'
         // usage() prints by itself and returns nothing
         cli.usage()
         System.exit(2)
     }
     def tigerMaster = new File(options.arguments()[0])
     // Default output file, unless -o was given.
     def outputFile = options.o ? new File(options.o) : new File('outputfile.xml')
     script(tigerMaster, outputFile)
     /**
      * Replaces, in every body element of the Tiger master file, each
      * subcorpus element by the children of the root of the external file it
      * points to, and writes the merged document (UTF-8, with an XML
      * declaration) to outputFile.
      *
      * The external file is resolved against the master file's directory,
      * from the 'external' attribute; a leading "file:" is dropped when present.
      *
      * @param tigerMaster the Tiger master file
      * @param outputFile the file to write
      */
     def script (File tigerMaster, File outputFile) {
         def masterDoc = new XmlParser().parse(tigerMaster)
         for (def body : masterDoc.body) {
             // body.subcorpus is a separate list, so body can be edited while looping
             for (def subNode : body.subcorpus) {
                 String external = "" + subNode.@external
                 // Only strip the prefix when it is there, instead of blindly
                 // dropping the first five characters.
                 String relativePath = external.startsWith('file:') ? external.substring(5) : external
                 File subCorpusFile = new File(tigerMaster.getParentFile(), relativePath)
                 def subDoc = new XmlParser().parse(subCorpusFile)
                 body.remove(subNode)
                 for (def child : subDoc.children()) {
                     body.append(child)
                 }
             }
         }
         def strWriter = new StringWriter()
         new groovy.util.XmlNodePrinter(new PrintWriter(strWriter)).print(masterDoc)
         def rez = strWriter.toString()
         outputFile.withWriter("UTF-8") { writer ->
             writer.println('<?xml version="1.0" encoding="UTF-8"?>')
             writer.print(rez)
         }
     }

tmp/org.txm.tigersearch.rcp/groovy/org/txm/importer/srcmf/package.html (revision 911)
	1	<html>
	2	<body>
	3	<p>TIGERSearch import module. This is a prototype that can only manage SRCMF TIGERSearch sources</p>
	4	</body>
	5	</html>
0	6

     #! /usr/bin/groovy
     package org.txm.importer.srcmf;
     /*
      * To change this template, choose Tools | Templates
      * and open the template in the editor.
      */
     // Set up globals
     // def masterFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos.xml')
     // def outputFile = new File('/home/tomr/Documents/Work/lyon12/srcmf/groovy/SrcmfImport/aucassin_surface/master_pos2.xml')
     // def headerFile = new File('/home/tomr/Documents/Work/SRCMF/srcmf_ts/header_srcmf.xml')
     // def feats = [nt:['cat', 'type', 'coord'], t:['pos', 'form', 'q']]
     // def firstFeat = [t:'word', nt:'cat']
     // Command-line entry point
     // Command-line driver: parses the options and runs the reheader on the
     // given master file using the given header file.
     def cli = new CliBuilder(
         usage:'MasterReheader.groovy [options] master_file.xml header_file.xml'
     )
     cli.h(longOpt:'help', 'Prints this message.')
     cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file.')
     cli.nt(longOpt:'ntfeats', args:1, argName:'ntfeats', 'NT features for which to provide value node.')
     cli.t(longOpt:'tfeats', args:1, argName:'tfeats', 'T features for which  to provide value node.')
     cli.nt1(longOpt:'ntfeat1st', args:1, argName:'1st ntfeat', 'First NT feature listed in header.')
     cli.t1(longOpt:'tfeat1st', args:1, argName:'1st tfeat', 'First T feature listed in header.')
     options = cli.parse(args)
     if (options == null) {
         // cli.parse() returns null on an unparsable command line and has
         // already reported the problem; nothing more to do.
     } else if (options.h) {
         // Honour the declared -h/--help option.
         cli.usage()
     } else if (options.arguments().size() == 2) {
         def masterFile = new File(options.arguments()[0])
         def headerFile = new File(options.arguments()[1])
         def masterFolder = masterFile.getCanonicalFile().getParent()
         def outputFile = null
         if (options.o) {
             outputFile = new File(options.o)
         } else {
             outputFile = new File(masterFolder, 'MasterReheader_out.xml')
         }
         def ntfirst = 'cat'
         if (options.nt1) {
             ntfirst = options.nt1
         }
         def tfirst = 'word'
         if (options.t1) {
             tfirst = options.t1
         }
         script(
             masterFile, headerFile, outputFile,
             // An absent option evaluates to false, which script() cannot query
             // with contains(); fall back to an empty list.
             ['nt':(options.nts ?: []), 't':(options.ts ?: [])],
             // Pass the resolved values so the 'cat'/'word' defaults actually apply.
             ['nt':ntfirst, 't':tfirst]
         )
     } else {
         println 'Incorrect number of command line arguments... exiting'
         // usage() prints by itself; wrapping it in println also printed "null".
         cli.usage()
     }
     /**
      * Rebuilds the feature declarations under head/annotation of a master file
      * from the attributes actually found on the t and nt nodes of its subcorpus
      * files, and writes the result to outputFile.
      *
      * @param masterFile master XML file listing the subcorpus files
      * @param headerFile XML file whose feature/value nodes supply the value descriptions
      * @param outputFile file receiving the re-headed master document (UTF-8)
      * @param feats      per type ('t', 'nt'): feature names that get value nodes
      * @param firstFeat  per type ('t', 'nt'): feature to list first; replaced in place by the
      *                   alphabetically first feature when it is not present in the corpus
      */
     def script(
         File masterFile, File headerFile, File outputFile, HashMap feats, HashMap firstFeat
     ) {
         // Load master and header files
         def master = new XmlParser().parse(masterFile)
         def header = new XmlParser().parse(headerFile)
         def masterFolder = masterFile.getCanonicalFile().getParent()
         // Per node type: attribute name -> set of values seen in the subcorpus files
         def attrVal = [nt:[:], t:[:]]
         // Closure recording every attribute value of one t / nt node.
         def getvals = { node, type ->
             node.attributes().each { mEntry ->
                 if (! attrVal[type].keySet().contains(mEntry.getKey())) {
                     attrVal[type][mEntry.getKey()] = new HashSet()
                 }
                 attrVal[type][mEntry.getKey()].add(mEntry.getValue())
             }
         }
         // Scan subcorpus files and build attribute lists.
         master.body.subcorpus.each {
             // The first five characters of @external are dropped
             // (presumably a "file:" prefix -- confirm against the corpus format).
             def subcorpusFile = new File (masterFolder, it.'@external'[5..-1])
             def subcorpus = new XmlParser().parse(subcorpusFile)
             subcorpus.s.graph.terminals.t.each { getvals.call(it, 't') }
             subcorpus.s.graph.nonterminals.nt.each { getvals.call(it, 'nt') }
         }
         // Id isn't an attribute in the header.
         attrVal['t'].remove('id')
         attrVal['nt'].remove('id')
         // Remove old feature nodes in master file. The node variable is now a
         // closure parameter: the original assigned an undeclared 'node', which
         // leaked into the script binding.
         master.head.annotation.feature.each { oldNode ->
             oldNode.parent().remove(oldNode)
         }
         assert (! master.head.annotation.feature)
         // Check firstFeat was relevant
         ['t', 'nt'].each { type ->
             if (! (attrVal[type].keySet().contains(firstFeat[type]))) {
                 firstFeat[type] = attrVal[type].keySet().sort()[0]
             }
         }
         assert attrVal['t'].keySet().contains(firstFeat['t'])
         assert attrVal['nt'].keySet().contains(firstFeat['nt'])
         def featList = [:]
         ['t', 'nt'].each { type ->
             featList[type] = [firstFeat[type]]
             featList[type].addAll(attrVal[type].keySet().findAll { it != firstFeat[type] })
         }
         // The header is not modified below, so collect its feature nodes once.
         def headerFeatures = header.'**'.feature
         // Add new feature and value nodes
         ['t', 'nt'].each { type ->
             // Tolerate a missing / false entry for this type.
             def wanted = feats[type] ?: []
             featList[type].each { feat ->
                 def fNode = new Node(master.head.annotation[0], 'feature',
                     ['domain':type.toUpperCase(), 'name':feat]
                 )
                 // Add value node if the node value is given in 'feats'
                 if (wanted.contains(feat)) {
                     attrVal[type][feat].each { value ->
                         assert headerFeatures
                         assert headerFeatures[0].'@name'
                         assert headerFeatures[0].'@domain'
                         assert ['NT', 'T'].contains(headerFeatures[0].'@domain')
                         def hFNode = headerFeatures.find {
                             it.'@name' == feat && (
                                 it.'@domain' == type.toUpperCase() || it.'@domain' == 'FREC'
                             )
                         }
                         // Description from the header, or '[unknown]' when the header
                         // does not declare this value.
                         def hValue = hFNode ? hFNode.value.find { it.'@name' == value } : null
                         def vText = hValue ? hValue.text() : '[unknown]'
                         new Node(fNode, 'value', ['name':value], vText)
                     }
                 }
             }
         }
         // Save to output_file. XmlUtil.serialize() declares encoding="UTF-8" in the
         // XML prolog, so the bytes must be written as UTF-8 rather than in the
         // platform default charset.
         outputFile.withWriter('UTF-8') { writer ->
             writer << groovy.xml.XmlUtil.serialize(master)
         }
     }

     /*
      * Calls all import scripts
      */
     /**
+     *
      * @author tmr
      */
     // Command line form of import statements:
     // import MasterReheader
     // import PunctInjectImport
     // import SubcorpusDataInject
     // import TigerSubcorpus2Main
     // import java.util.logging.FileHandler
     // import javax.xml.parsers.DocumentBuilderFactory
     // TXM package statement
     package org.txm.importer.srcmf
     import java.util.logging.*;
     // Command line entry point
     // Command-line driver: runs every import step on the given TIGER master
     // file, XML-TXM file and header file.
     def cli = new CliBuilder(
         usage:'SrcmfImport.groovy [options] tiger_master.xml xml_txm.xml header_file.xml'
     )
     cli.h(longOpt:'help', 'Prints this message.')
     options = cli.parse(args)
     // cli.parse() returns null when the command line cannot be parsed
     // (it reports the problem itself), so guard before dereferencing it.
     if (options == null) {
         System.exit(2)
     }
     // Honour the declared -h/--help option.
     if (options.h) {
         cli.usage()
         System.exit(0)
     }
     if (options.arguments().size() != 3) {
         println 'Incorrect number of command line arguments... exiting'
         // usage() prints by itself; wrapping it in println also printed "null".
         cli.usage()
         System.exit(2)
     }
     def tigerFile = new File(options.arguments()[0])
     def txmFile = new File(options.arguments()[1])
     def headerFile = new File(options.arguments()[2])
     // The working ("bin") directory is the grandparent of the XML-TXM file.
     def tigerXmlAll = doAllButPnc(
         tigerFile,
         txmFile,
         headerFile,
         txmFile.getAbsoluteFile().getParentFile().getParentFile()
     )
     doPnc(tigerXmlAll, txmFile)
     // The merged file is only an intermediate product.
     tigerXmlAll.delete()
     /**
      * Runs the PoS injection, reheader and merge steps and returns the merged file.
      *
      * @param tigerFile  TIGER master file
      * @param txmFile    XML-TXM file; only its parent directory is used, the first file
      *                   listed in that directory is the one actually injected
      * @param headerFile header file handed to the reheader
      * @param binDir     directory receiving the "tiger" work folder and "tiger.log"
      * @return the merged "TigerAll.xml" file inside the "tiger" folder
      */
     def doAllButPnc(File tigerFile, File txmFile, File headerFile, File binDir) {
         // Run pos injection script
         File txmSrcDir = txmFile.getAbsoluteFile().getParentFile()
         File tigerDir = new File(binDir, "tiger")
         tigerDir.mkdir()
         File masterpos = new File(tigerDir, "master_pos.xml")
         File xmltxm = txmSrcDir.listFiles()[0]
         File logFile = new File(binDir, "tiger.log")
         def logHandler = new FileHandler(logFile.getAbsolutePath())
         try {
             def sdi = new SubcorpusDataInject(xmltxm, logHandler, "vers")
             sdi.processMaster(tigerFile, masterpos)
         } finally {
             // Release the log file (and its lock file) once the injection is done.
             logHandler.close()
         }
         // Run reheader script
         def reheader = new MasterReheader()
         File tmp = File.createTempFile("tmp", ".xml",tigerDir)
         def feats = ['nt':['cat', 'type', 'coord'], 't':['pos', 'form', 'q']]
         def firstFeat = ['nt':'cat', 't':'word']
         reheader.script(masterpos, headerFile, tmp, feats, firstFeat)
         // createTempFile() has already created the file, so an existence test can
         // never fail; an empty file is what signals that nothing was written.
         if (!tmp.exists() || tmp.length() == 0) {
             println "Error: reheader failed"
         }
         masterpos.delete()
         if (!tmp.renameTo(masterpos)) {
             println "Error: could not rename ${tmp} to ${masterpos}"
         }
         // Run merge master & subcorpus script
         def tigerXmlAll = new File(masterpos.getParentFile(), "TigerAll.xml")
         def mergescript = new TigerSubcorpus2Main()
         mergescript.script(masterpos, tigerXmlAll)
         return tigerXmlAll
     }
     /**
      * Runs the punctuation injection on the merged TIGER file; the result goes
      * to "TigerPnc.xml" next to it.
      *
      * @param tigerXmlAll merged TIGER file
      * @param txmFile     XML-TXM file handed to the injector
      */
     def doPnc(File tigerXmlAll, File txmFile) {
         // Declared locally: the original assigned an undeclared 'injector',
         // which leaked into the script binding.
         def injector = new PunctInjectImport(tigerXmlAll, txmFile)
         injector.outputFile = new File(tigerXmlAll.getParentFile(), "TigerPnc.xml")
         injector.process()
     }

     #! /usr/bin/groovy
     package org.txm.importer.srcmf;
     /*
      * To change this template, choose Tools | Templates
      * and open the template in the editor.
      */
     import java.util.logging.Logger
     import java.util.logging.FileHandler
     import java.util.logging.SimpleFormatter
     import java.util.logging.Level
     class SubcorpusDataInject {
     	// Globals
     	def textForm = 'prose'
     	def TEI = null
     	def wNodes = []
     	def wNodesInQ = []
     	// Set up namespaces
     	static nsXml = new groovy.xml.Namespace('http://www.w3.org/XML/1998/namespace', 'xml')
     	static nsTxm = new groovy.xml.Namespace('http://textometrie.org/1.0', 'txm')
     	// Set up a logger
     	static logger = Logger.getLogger('subcorpusDataInjectLog')
     	// END GLOBALS
     	/**
     	 * Loads the XML-TXM file, determines the text form, turns line breaks into
     	 * a word attribute when needed, and builds the word node lists.
     	 *
     	 * @param xmlTxmFile  XML-TXM file to parse
     	 * @param fh          handler attached to the class logger
     	 * @param newTextForm text form replacing the default when non-empty; a form found
     	 *                    in the TEI header still takes precedence
     	 */
     	SubcorpusDataInject (
     	File xmlTxmFile,
     	FileHandler fh,
     	String newTextForm
     	) {
     		// An empty or null argument keeps the default text form.
     		textForm = newTextForm ? newTextForm : textForm
     		// Logger set-up
     		logger.addHandler(fh)
     		logger.setLevel(Level.ALL)
     		fh.setFormatter(new SimpleFormatter())
     		// Load the XML-TXM document
     		logger.log(Level.INFO, "Parsing XML-TXM file (may take a while).")
     		TEI = new XmlParser().parse(xmlTxmFile)
     		// The TEI header may name the text form through a category reference.
     		logger.log(Level.INFO, "Reading TEI header to detect text form.")
     		def catRefs = TEI.teiHeader.profileDesc.textClass.catRef
     		if (!catRefs) {
     			logger.log(Level.INFO, "No text form in TEI header, assuming '$textForm'.")
     		} else {
     			// Drop the first character of the target to obtain the category id.
     			def wantedId = catRefs.'@target'[0][1..-1]
     			def category = TEI.teiHeader.encodingDesc.'**'.category.find { cat ->
     				cat.'@id' == wantedId || cat.attribute(nsXml.id) == wantedId
     			}
     			textForm = category.catDesc[0].text()
     			logger.log(Level.INFO, "Found text form '$textForm' in TEI header.")
     		}
     		// For verse or mixed texts, record line breaks on the neighbouring words.
     		if (textForm in ['vers', 'mixte']) {
     			logger.log(Level.INFO, "Converting <lb/> tags to a word property for later processing.")
     			// Work list walked front to back; children are pushed back at the front,
     			// which yields a document-order traversal.
     			def pending = TEI.text.body
     			def afterLb = false
     			def previousWord = null
     			while (pending) {
     				def current = pending.remove(0)
     				switch (current.name().getLocalPart()) {
     					case 'lb':
     						afterLb = true
     						if (previousWord) {
     							previousWord.'@lb' = 'end'
     						}
     						break
     					case 'w':
     						// Punctuation never carries the line-break mark.
     						if (!punctTest(current)) {
     							if (afterLb) {
     								current.'@lb' = 'start'
     							}
     							previousWord = current
     							afterLb = false
     						}
     						break
     					default:
     						pending.addAll(0, current.children().findAll { it.getClass() == Node })
     				}
     			}
     		}
     		// Word node lists: all words, and the words found below <q> elements.
     		logger.log(Level.INFO, "Building node lists.")
     		wNodes = TEI.text.'**'.findAll { it.name().getLocalPart() == 'w' }
     		wNodesInQ = TEI.text.'**'.q.'**'.findAll { it.name().getLocalPart() == 'w' }
     	}
     	/**
     	 * Command-line entry point: loads the XML-TXM file given first and injects
     	 * its data into the TS file given second (a master file when -m is set).
     	 *
     	 * @param args command-line arguments
     	 */
     	static main(def args) {
     		// Defaults used when -o / -l are not given
     		def outputFile = 'test-out.xml'
     		def logName = 'subcorpusDataInjectLog.log'
     		def cli = new CliBuilder(
     				usage:'SubcorpusDataInject.groovy [options] TXM_file.xml TS_file.xml'
     				)
     		cli.h(longOpt:'help', 'Prints this message')
     		cli.m(longOpt:'master', 'Specifies input file is a TS master file.')
     		cli.v(longOpt:'verse', 'Sets default text form to verse. Overridden by TEI header.')
     		cli.l(longOpt:'logfile', args:1, argName:'logfile.log', 'Use given file for log.')
     		cli.o(longOpt:'output', args:1, argName:'outputfile.xml', 'Output to given file')
     		println args
     		def options = cli.parse(args)
     		if (options == null) {
     			// cli.parse() returns null on an unparsable command line and has
     			// already reported the problem.
     			return
     		}
     		if (options.h) {
     			// Honour the declared -h/--help option.
     			cli.usage()
     			return
     		}
     		if (options.arguments().size() == 2) {
     			// Load the XML-TXM file
     			def txmFile = new File(options.arguments()[0])
     			def handler = new FileHandler(options.logfile ?: logName)
     			// An empty text form keeps the class default.
     			def sdi = new SubcorpusDataInject(txmFile, handler, options.v ? 'vers' : '')
     			// Process the master file or the single subcorpus file
     			def tsFile = new File(options.arguments()[1]).getCanonicalFile()
     			def target = new File(options.output ?: outputFile)
     			if (options.m) {
     				sdi.processMaster(tsFile, target)
     			} else {
     				sdi.processSubcorpus(tsFile, target)
     			}
     		} else {
     			println "Wrong number of arguments."
     			// usage() prints by itself; wrapping it in println also printed "null".
     			cli.usage()
     		}
     	}
     	/**
     	 * Runs processSubcorpus on each subcorpus listed in a master file, points
     	 * the master at the generated "_pos.xml" files and saves it to outputFile.
     	 *
     	 * @param masterFile master file; subcorpus paths are resolved against its directory
     	 * @param outputFile new master file; the "_pos.xml" subcorpus files are written
     	 *                   to its directory
     	 */
     	def processMaster (File masterFile, File outputFile) {
     		logger.log(Level.INFO, "Loading corpus master file")
     		// Resolve both directories from absolute paths. A file given without a
     		// directory has no parent, and new File('', child) resolves child against
     		// the filesystem default directory (the root on Unix), not the working
     		// directory.
     		def masterFolder = masterFile.getAbsoluteFile().getParentFile()
     		def outputFolder = outputFile.getAbsoluteFile().getParentFile()
     		def master = new XmlParser().parse(masterFile)
     		// Process each subcorpus file
     		master.body.subcorpus.each {
     			// [5..-1] drops the first five characters of @external (presumably a
     			// "file:" prefix -- confirm); [5..-5] also drops the last four (".xml").
     			processSubcorpus(
     					new File (masterFolder, it.'@external'[5..-1]),
     					new File (outputFolder, "${it.'@external'[5..-5]}_pos.xml".toString())
     					)
     			it.'@external' = "${it.'@external'[0..-5]}_pos.xml"
     		}
     		logger.log(Level.INFO, "Saving new master file ${outputFile.getName()}")
     		// XmlUtil.serialize() declares encoding="UTF-8" in the XML prolog, so the
     		// bytes must be written as UTF-8 rather than in the platform charset.
     		outputFile.withWriter('UTF-8') { writer ->
     			writer << groovy.xml.XmlUtil.serialize(master)
     		}
     		logger.log(Level.INFO, "Run reheader on this file!")
     	}
     	/**
     	 * Sets the 'pos', 'form' and 'q' attributes on every t node of a subcorpus
     	 * file from the matching word nodes of the loaded XML-TXM file, recalculates
     	 * 'headpos' on the nt nodes and saves the result to outputFile.
     	 *
     	 * @param subcorpusFile subcorpus file to process
     	 * @param outputFile    file receiving the enriched subcorpus (UTF-8)
     	 */
     	def processSubcorpus (File subcorpusFile, File outputFile) {
     		// Parse Subcorpus file
     		logger.log(Level.INFO, "Loading subcorpus file ${subcorpusFile.getName()}")
     		def TS = new XmlParser().parse(subcorpusFile)
     		// This is used as a stack
     		def tNodes = TS.s.graph.terminals.t
     		// This isn't
     		def tNodesFixed = TS.s.graph.terminals.t
     		// Build the list of [duplicata id, real id] pairs
     		logger.log(Level.INFO, "Building duplicata correspondance list")
     		def duplIdMatches = tNodes.findAll {it.'@word' == '*'}.collect { dupl ->
     			def duplId = dupl.'@id'
     			def realId = dupl.parent().parent().nonterminals.nt.edge.find {
     				it.'@idref' == duplId
     			}.parent().secedge.find {
     				it.'@label' == 'dupl'
     			}.'@idref'
     			[duplId, realId]
     		}
     		logger.log(Level.INFO, "${duplIdMatches.size()} duplicatas found.")
     		// Add pos & form tags
     		logger.log(Level.INFO, "Adding PoS and form tags to each t-node")
     		// Get first wNode corresponding to a tNode.
     		// -1 means "not found yet". The original tested the index for truthiness,
     		// which rejected a legitimate match at index 0 and accepted the -1 that
     		// indexOf() returns when nothing matches.
     		def i = 0
     		def wIx = -1
     		while (i < tNodes.size()) {
     			// Avoid trying to find "fake" first words, it's very inefficient
     			if (tNodes[i].'@word' != '#') {
     				wIx = wNodes.findIndexOf { idmatch(tNodes[i], it) }
     			}
     			if (wIx >= 0) {
     				break
     			} else {
     				i++
     			}
     		}
     		assert wIx >= 0, "$i"
     		/*
     		 * MAIN ITERATOR BEGINS HERE FOR POS INJECTION
     		 */
     		// Iterate over tNodes.
     		while (tNodes) {
     			def thisTNode = tNodes.remove(0)
     			def thisTNodeId = thisTNode.'@editionId'
     			def wNode = null
     			def q = false
     			// Define the PoS / q / form injecting closure.
     			// It reads wNode and q, which are set just before each call.
     			def injectPos = {
     				// Q tag
     				if (q) {
     					thisTNode.'@q' = 'y'
     				} else {
     					thisTNode.'@q' = 'n'
     				}
     				// Form tag
     				if (textForm == 'prose') {
     					thisTNode.'@form' = 'prose'
     				} else if (
     				textForm == 'vers' ||
     				(
     				textForm == 'mixte'
     				&& wNode.parent().parent().name().getLocalPart() == 'ab'
     				)
     				) {
     					if (! wNode.'@lb') {
     						thisTNode.'@form' = 'vers'
     					} else if (wNode.'@lb' == 'start') {
     						thisTNode.'@form' = 'vers_debut'
     					} else if (wNode.'@lb' == 'end')  {
     						thisTNode.'@form' = 'vers_fin'
     					}
     				}
     				// DEBUG TMR 22/08/2012
     				else {
     					thisTNode.'@form' = 'prose'
     				}
     				// END DEBUG TMR 22/08/2012
     				// Find PoS tag in BFM file
     				// Try txm:ana tags
     				def txmAnaList = wNode[nsTxm.ana]
     				// Then interp tags
     				if (! txmAnaList) {
     					txmAnaList = wNode.interp
     				}
     				// Try type="#pos_syn"
     				if (txmAnaList.find {it.'@type' == '#pos_syn'}) {
     					thisTNode.'@pos' = txmAnaList.find {it.'@type' == '#pos_syn'}.text()
     					// type attr of wNode
     				} else if (! txmAnaList && wNode.'@type') {
     					thisTNode.'@pos' = wNode.'@type'
     					// type="#pos"
     					// (compare with ==; the original used '=', which overwrote the
     					// type attribute of the first node and always matched)
     				} else if (txmAnaList.find { it.'@type' == '#pos' }) {
     					thisTNode.'@pos' = txmAnaList.find {it.'@type' == '#pos'}.text()
     					// type="#fropos" (same '=' / '==' correction)
     				} else if (txmAnaList.find { it.'@type' == '#fropos' }) {
     					thisTNode.'@pos' = txmAnaList.find {it.'@type' == '#fropos'}.text()
     				} else {
     					thisTNode.'@pos' = '--'
     				}
     			}
     			// End of injecting closure.
     			// Check whether wNode matches tNode
     			if (thisTNode.'@word' == '#') {
     				// Not expecting to find these in BFM file, all OK.
     				thisTNode.'@pos' = '--'
     				thisTNode.'@form' = '--'
     				thisTNode.'@q' = '--'
     			} else if ( idmatch(thisTNode, wNodes[wIx]) ) {
     				// Perfect match
     				wNode = wNodes[wIx]
     				q = wNodesInQ.contains(wNode)
     				injectPos.call()
     				wIx++
     			} else if (thisTNode.'@word' == '*') {
     				// Duplicata, use dupl match list to find correct wNode
     				def duplIdMatch = duplIdMatches.find { it[0] == thisTNode.'@id' }
     				if (duplIdMatch) {
     					def matchTNode = tNodesFixed.find { it.'@id' == duplIdMatch[1] }
     					// To enhance performance, I assume initially that the wNode matching the
     					// duplicata is within +- 10 words of the current position in the
     					// list.  The wNodes list is very long, and searches over the entire
     					// list are avoided.
     					assert matchTNode, "$duplIdMatch"
     					if (wIx + 10 < wNodes.size() && wIx - 10 > 0) {
     						wNode = wNodes[wIx-10 .. wIx+10].find {
     							idmatch(matchTNode, it)
     						}
     					}
     					// Use whole list if it failed.
     					if (! wNode) {
     						wNode = wNodes.find {
     							idmatch(matchTNode, it)
     						}
     					}
     					q = wNodesInQ.contains(wNode)
     					injectPos.call()
     					// Don't update wIx value!
     				} else {
     					thisTNode.'@pos' = '--'
     					thisTNode.'@form' = '--'
     					thisTNode.'@q' = '--'
     					logger.log(
     							Level.WARNING,
     							"<w/> node matching TS duplicata $thisTNodeId not found."
     							)
     				}
     			}  else if (punctTest(wNodes[wIx])) {
     				// not expecting to find these in TS file; try same tNode again.
     				tNodes.add(0, thisTNode)
     				wIx++
     			} else if (!
     			wNodes.find {
     				idmatch (thisTNode, it)
     			}
     			) {
     				// Check the SRCMF corpus hasn't acquired an extra word
     				logger.log(
     						Level.WARNING,
     						"TS word $thisTNodeId (${thisTNode.'@word'}) is " +
     						"absent from BFM file."
     						)
     				thisTNode.'@pos' = '--'
     				thisTNode.'@form' = '--'
     				thisTNode.'@q' = '--'
     			} else {
     				// It doesn't match, it's not a duplicata,
     				// it's not BFM punctuation, but it IS found somewhere in the BFM
     				// corpus.
     				logger.log(
     						Level.WARNING,
     						"Discontinuity in TS corpus before word $thisTNodeId " +
     						"(${thisTNode.'@word'})."
     						)
     				wNode = wNodes.find {
     					idmatch(thisTNode, it)
     				}
     				q = wNodesInQ.contains(wNode)
     				injectPos.call()
     				wIx = wNodes.indexOf(wNode) + 1
     			}
     		}
     		/*
     		 * End Main iteration
     		 */
     		/*******************
     		 * Recalculate headpos
     		 */
     		logger.log(Level.INFO, "Recalculating headpos.")
     		TS.s.graph.nonterminals.nt.each { nt ->
     			// Edges labelled 'L' designate the head candidates of this nt node.
     			def a = nt.edge.findAll { it.'@label' == 'L' }
     			if (a.size() == 0) {
     				nt.'@headpos' = '--'
     			} else if (a.size() == 1) {
     				nt.'@headpos' = nt.parent().parent().terminals.t.find {
     					it.'@id' == a[0].'@idref'
     				}.'@pos'
     			} else {
     				// PoS of every candidate. collect() is required here: each() returns
     				// the edge list itself, so the original contains() test never matched.
     				def candidatePos = a.collect { edge ->
     					nt.parent().parent().terminals.t.find {
     						it.'@id' == edge.'@idref'
     					}.'@pos'
     				}
     				// The first PoS of this list present among the candidates wins.
     				for (def headpos : ['NOM', 'PRO', 'ADJ', 'VER', 'ADV', 'CON', 'PRE', 'DET']) {
     					if (candidatePos.contains(headpos)) {
     						nt.'@headpos' = headpos + '?'
     						break
     					}
     				}
     			}
     			assert nt.'@headpos'
     		}
     		logger.log(Level.INFO, "Saving new subcorpus file ${outputFile.getName()}")
     		// XmlUtil.serialize() declares encoding="UTF-8" in the XML prolog, so the
     		// bytes must be written as UTF-8 rather than in the platform charset.
     		outputFile.withWriter('UTF-8') { writer ->
     			writer << groovy.xml.XmlUtil.serialize(TS)
     		}
     	}
     	// Tells whether a word node is punctuation: its type attribute when present,
     	// otherwise the text of its txm:ana children, otherwise that of its interp
     	// children, must start with 'pon' (case-insensitive).
     	private punctTest(gNode) {
     		def typeAttr = gNode.'@type'
     		if (typeAttr) {
     			return typeAttr.toLowerCase().startsWith('pon')
     		}
     		// No usable type attribute: fall back to the annotation children.
     		def anaNodes = gNode[nsTxm.ana]
     		def candidates = anaNodes ? anaNodes : gNode.interp
     		if (!candidates) {
     			return false
     		}
     		return candidates.any { it.text().toLowerCase().startsWith('pon') }
     	}
     	private idmatch (tNode, wNode) {
     		// Matches a TS node with a <w/> node
     		def idMatch = false
     		def wNodeId = wNode.'@id'
     		if (! wNodeId) {
     			wNodeId = wNode.attribute(nsXml.id)
+    		}
     		def tNodeId = tNode.'@editionId'
     		def regex = /(\d+)([_a-z]+)?$/
     		def m1 = (tNodeId =~ regex)
     		def m2 = (wNodeId =~ regex)
     		if ( m1.find() && m2.find() ) {
     			idMatch = (m1.group(1).toInteger() == m2.group(1).toInteger())
     		} else {
     			logger.log(
     					Level.WARNING,
     					"Idmatch routine can't find integer part of one or both of the" +
     					"paired Ids for nodes $tNode, $wNode"
+    					)
+    		}

Laboratoire ICAR » Plateforme TXM

Révision 911