/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 2037

     // STANDARD DECLARATIONS
     package org.txm.macro.urs.democrat
     import java.nio.charset.Charset
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import net.sf.saxon.functions.IndexOf
     import org.txm.importer.StaxIdentityParser
     import org.txm.objects.*
     import org.txm.rcp.swt.widget.parameters.*
     import org.txm.searchengine.cqp.corpus.*
     import org.txm.utils.CsvReader
     import org.txm.utils.io.IOUtils
     import org.txm.scripts.importer.XPathResult
     import javax.xml.xpath.XPathConstants
     /**
      * Builds a customized teiHeader from a template file and a map of metadata
      * values, and replaces the teiHeader of an XML-TXM text file with it.
      *
      * The macro body that previously sat in this class as commented-out code drove
      * it as follows: it loaded a properties file mapping metadata column names to
      * XPaths, read a tab-separated metadata file that had to contain the 'id' and
      * 'corpus_id' columns, looked up txm/&lt;corpus id&gt;/&lt;text id&gt;.xml under the project
      * directory of the matching corpus, built the header with
      * {@link #getCustomizedTEiHeader} and only wrote it into the text file when the
      * returned content was neither null nor empty.
      */
     class Metadata2TEiHeader {

     	/** Prefix removed from the serialized teiHeader before it is returned. */
     	private static final String XML_DECLARATION = '<?xml version="1.0" encoding="UTF-8"?>'

     	/** Verbosity of the console traces: the higher the value, the more is printed. */
     	int debug = 0;

     	/**
     	 * @param debug verbosity level stored in {@link #debug}
     	 */
     	public Metadata2TEiHeader(int debug) {
     		this.debug = debug
     	}

     	/**
     	 * Injects metadata values into a copy of the teiHeader template.
     	 *
     	 * For each key of <code>data</code> that has an XPath in <code>xpathProperties</code>,
     	 * the first node selected by that XPath is patched with the value; when the
     	 * XPath selects nothing, a new element is appended to the parent selected by
     	 * the XPath minus its last step. Keys without an XPath are ignored.
     	 *
     	 * @param teiHeaderTemplateFile XML file containing the teiHeader template
     	 * @param data metadata values indexed by metadata name
     	 * @param xpathProperties XPath of the node to fill, indexed by metadata name
     	 * @return the serialized teiHeader element without XML declaration, or null
     	 *         when the template contains no teiHeader element below its root
     	 */
     	def getCustomizedTEiHeader(File teiHeaderTemplateFile, def data, Properties xpathProperties) {
     		XPathResult xpathProcessor = new XPathResult(teiHeaderTemplateFile);

     		for (String info : data.keySet()) {
     			String xpath = xpathProperties[info];
     			String value = data[info]
     			if (xpath == null) {
     				continue; // not a data to inject
     			}
     			if (debug > 1) println "   injecting '$info'='$value' in '$xpath'"

     			def expr = xpathProcessor.xpath.compile(xpath);
     			// the document element is fetched again at each turn because a previous patch may have replaced it
     			def first = expr.evaluate(xpathProcessor.doc.getDocumentElement(), XPathConstants.NODE);
     			if (first != null) {
     				patchExistingNode(xpathProcessor.doc, first, info, value)
     			} else if (!createMissingNode(xpathProcessor, xpath, value)) {
     				println " ** info=$info not found or created for "+xpathProperties[info]
     			}
     		}

     		def teiHeader = xpathProcessor.doc.getDocumentElement().getElementsByTagName("teiHeader").item(0)
     		if (teiHeader == null) {
     			// previously the "null" string was truncated, which raised a StringIndexOutOfBoundsException
     			println " ** no teiHeader element found in $teiHeaderTemplateFile"
     			return null
     		}

     		// NOTE(review): the concatenation presumably makes Groovy serialize the DOM element
     		// with a leading XML declaration - confirm; the prefix is only removed when present
     		String content = ""+teiHeader
     		if (content.startsWith(XML_DECLARATION)) {
     			content = content.substring(XML_DECLARATION.length())
     		}
     		return content
     	}

     	/**
     	 * Writes the value into a node selected by an XPath: an element is replaced
     	 * by a same-named element holding the value as text and a copy of its
     	 * attributes, an attribute or a text node gets the value as node value,
     	 * any other node type is left untouched.
     	 */
     	private void patchExistingNode(def doc, def node, String info, String value) {
     		switch (node.getNodeType()) {
     			case org.w3c.dom.Node.ELEMENT_NODE:
     				if (debug > 1) println "   patching: $info with "+value
     				def newChild = doc.createElement(node.getTagName())
     				newChild.appendChild(doc.createTextNode(value))
     				def attributes = node.getAttributes();
     				for (int i = 0 ; i < attributes.getLength() ; i++) { // copy attributes
     					def attr = attributes.item(i)
     					newChild.setAttribute(attr.getNodeName(), attr.getNodeValue())
     				}
     				node.getParentNode().replaceChild(newChild, node)
     				break;
     			case org.w3c.dom.Node.ATTRIBUTE_NODE:
     				if (debug > 1) println "   patching attribute: $info with "+value
     				node.setNodeValue(value)
     				break;
     			case org.w3c.dom.Node.TEXT_NODE:
     				if (debug > 1) println "   patching text: $info with "+value
     				node.setNodeValue(value)
     				break;
     			default:
     				break
     		}
     	}

     	/**
     	 * Creates the element named by the last step of the XPath under the node
     	 * selected by the preceding steps. A trailing "/@name" step stores the value
     	 * in that attribute instead of the element text; a "[@a="x",@b="y"]" predicate
     	 * on the last step is turned into attributes of the new element.
     	 *
     	 * @return false when the XPath has no parent step, when the parent is not
     	 *         found or when the last step has no element name
     	 */
     	private boolean createMissingNode(XPathResult xpathProcessor, String xpath, String value) {
     		String attribute_xpath = null;
     		int attributeStart = xpath.lastIndexOf("/@")
     		if (attributeStart > 0) {
     			attribute_xpath = xpath.substring(attributeStart+2)
     			xpath = xpath.substring(0, attributeStart)
     		}

     		int lastSlash = xpath.lastIndexOf("/")
     		if (lastSlash <= 0) {
     			return false // no parent step: substring() would fail or the parent XPath would be empty
     		}
     		String parent_xpath = xpath.substring(0, lastSlash)
     		String element_xpath = xpath.substring(lastSlash+1)

     		def parent_expr = xpathProcessor.xpath.compile(parent_xpath);
     		def parent = parent_expr.evaluate(xpathProcessor.doc.getDocumentElement(), XPathConstants.NODE);
     		if (parent == null) {
     			return false
     		}

     		// the prefix is searched before the predicate only, so that a ':' in a predicate value is not taken for it;
     		// a step without prefix is accepted (indexOf returns -1 and the whole step is the name)
     		int predicateStart = element_xpath.indexOf("[")
     		String step = (predicateStart < 0) ? element_xpath : element_xpath.substring(0, predicateStart)
     		String nodename = step.substring(step.indexOf(":")+1)
     		if (nodename.length() == 0) {
     			return false
     		}

     		def attributes = [:]
     		if (predicateStart >= 0 && element_xpath.endsWith("]")) {
     			String attributesString = element_xpath.substring(predicateStart+1, element_xpath.length()-1)
     			for (String attributeString : attributesString.split(",")) {
     				def split = attributeString.trim().split("=", 2)
     				if (split.length < 2 || split[0].length() < 2 || split[1].length() < 2) {
     					println " ** predicate '$attributeString' ignored in '$xpath'"
     					continue
     				}
     				attributes[split[0].substring(1)] = split[1].substring(1, split[1].length()-1) // remove @ and remove ""
     			}
     		}

     		def newChild = xpathProcessor.doc.createElement(nodename)
     		for (String attributeName : attributes.keySet()) {
     			newChild.setAttribute(attributeName, attributes[attributeName])
     		}
     		if (attribute_xpath != null) {
     			newChild.setAttribute(attribute_xpath, value)
     		} else {
     			newChild.appendChild(xpathProcessor.doc.createTextNode(value))
     		}
     		if (debug > 1) println "  creating node $parent_xpath / $nodename[$attributes] : $attribute_xpath = $value"
     		parent.appendChild(newChild)
     		return true
     	}

     	/**
     	 * Replaces the teiHeader of an XML-TXM file.
     	 *
     	 * The result is first written to "tmp_&lt;file name&gt;" in the system temporary
     	 * directory; the original file is then moved to "copy_&lt;file name&gt;" in the same
     	 * directory and the result takes its place. If the result cannot be put in
     	 * place, the original file is moved back.
     	 *
     	 * @param txmFile the XML-TXM file to edit
     	 * @param xmlteiHeaderContent the serialized teiHeader written in place of the current one
     	 * @return true if the file has been replaced, false otherwise
     	 */
     	def replaceHeader(File txmFile, String xmlteiHeaderContent) {
     		println " editing: $txmFile..."

     		StaxIdentityParser sparser = new StaxIdentityParser(txmFile) {
     			// true while the parser is inside the teiHeader element that is being dropped
     			boolean start = false;

     			public void processStartElement() {
     				if (localname == "teiHeader") {
     					start = true;
     					if (debug > 2) println "   replacing teiHeader"
     					if (debug > 3) println "       with $xmlteiHeaderContent"
     					output.write(xmlteiHeaderContent.getBytes(Charset.forName("UTF-8")));  // REPLACE CONTENT !
     				}
     				if (!start) {
     					super.processStartElement();
     				}
     			}

     			public void processEndElement() {
     				if (!start) {
     					super.processEndElement();
     				}
     				if (localname == "teiHeader") {
     					start = false;
     					if (debug > 2) println "   replace done"
     				}
     			}

     			public void processCharacters() {
     				if (!start) {
     					super.processCharacters();
     				}
     			}

     			public void processComment() {
     				if (!start) {
     					super.processComment();
     				}
     			}
     		}

     		// system temporary directory instead of a directory that only exists on one developer machine
     		File tmpDirectory = new File(System.getProperty("java.io.tmpdir"));
     		File outfile = new File(tmpDirectory, "tmp_"+txmFile.getName())
     		File copyFile = new File(tmpDirectory, "copy_"+txmFile.getName())

     		if (!sparser.process(outfile)) {
     			println " -> FAIL see $outfile"
     			return false;
     		}

     		if (!moveFile(txmFile, copyFile)) {
     			println " -> FAIL could not make a copy of $txmFile in $copyFile"
     			println " -> see result in $outfile"
     			return false;
     		}

     		if (!moveFile(outfile, txmFile)) { // replace inputFile
     			println " -> FAIL could not replace $txmFile"
     			println " -> see result in $outfile"
     			// put the original back so that the corpus does not lose the text
     			if (!moveFile(copyFile, txmFile)) {
     				println " -> original file kept in $copyFile"
     			}
     			return false;
     		}

     		println " -> SUCCESS see $txmFile"
     		return true
     	}

     	/**
     	 * Moves a file, overwriting the target if it exists. Unlike File.renameTo,
     	 * Files.move also works when source and target are on different file systems.
     	 *
     	 * @return true if the file has been moved
     	 */
     	private boolean moveFile(File source, File target) {
     		try {
     			java.nio.file.Files.move(source.toPath(), target.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING)
     			return true
     		} catch (IOException e) {
     			if (debug > 0) println "   could not move $source to $target: $e"
     			return false
     		}
     	}
     }

     // STANDARD DECLARATIONS
     package org.txm.macro.urs.democrat
     import java.nio.charset.Charset
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import net.sf.saxon.functions.IndexOf
     import org.txm.importer.StaxIdentityParser
     import org.txm.objects.*
     import org.txm.rcp.swt.widget.parameters.*
     import org.txm.searchengine.cqp.corpus.*
     import org.txm.utils.CsvReader
     import org.txm.utils.io.IOUtils
     import org.txm.scripts.importer.XPathResult
     import javax.xml.xpath.XPathConstants
     /**
      * Builds a customized teiHeader from a template file and a map of metadata
      * values, and replaces the teiHeader of an XML-TXM text file with it.
      *
      * The macro body that previously sat in this class as commented-out code drove
      * it as follows: it loaded a properties file mapping metadata column names to
      * XPaths, read a tab-separated metadata file that had to contain the 'id' and
      * 'corpus_id' columns, looked up txm/&lt;corpus id&gt;/&lt;text id&gt;.xml under the project
      * directory of the matching corpus, built the header with
      * {@link #getCustomizedTEiHeader} and only wrote it into the text file when the
      * returned content was neither null nor empty.
      */
     class Metadata2TEIHeader {

     	/** Prefix removed from the serialized teiHeader before it is returned. */
     	private static final String XML_DECLARATION = '<?xml version="1.0" encoding="UTF-8"?>'

     	/** Verbosity of the console traces: the higher the value, the more is printed. */
     	int debug = 0;

     	/**
     	 * @param debug verbosity level stored in {@link #debug}
     	 */
     	public Metadata2TEIHeader(int debug) {
     		this.debug = debug
     	}

     	/**
     	 * Injects metadata values into a copy of the teiHeader template.
     	 *
     	 * For each key of <code>data</code> that has an XPath in <code>xpathProperties</code>,
     	 * the first node selected by that XPath is patched with the value; when the
     	 * XPath selects nothing, a new element is appended to the parent selected by
     	 * the XPath minus its last step. Keys without an XPath are ignored.
     	 *
     	 * @param teiHeaderTemplateFile XML file containing the teiHeader template
     	 * @param data metadata values indexed by metadata name
     	 * @param xpathProperties XPath of the node to fill, indexed by metadata name
     	 * @return the serialized teiHeader element without XML declaration, or null
     	 *         when the template contains no teiHeader element below its root
     	 */
     	def getCustomizedTEiHeader(File teiHeaderTemplateFile, def data, Properties xpathProperties) {
     		XPathResult xpathProcessor = new XPathResult(teiHeaderTemplateFile);

     		for (String info : data.keySet()) {
     			String xpath = xpathProperties[info];
     			String value = data[info]
     			if (xpath == null) {
     				continue; // not a data to inject
     			}
     			if (debug > 1) println "   injecting '$info'='$value' in '$xpath'"

     			def expr = xpathProcessor.xpath.compile(xpath);
     			// the document element is fetched again at each turn because a previous patch may have replaced it
     			def first = expr.evaluate(xpathProcessor.doc.getDocumentElement(), XPathConstants.NODE);
     			if (first != null) {
     				patchExistingNode(xpathProcessor.doc, first, info, value)
     			} else if (!createMissingNode(xpathProcessor, xpath, value)) {
     				println " ** info=$info not found or created for "+xpathProperties[info]
     			}
     		}

     		def teiHeader = xpathProcessor.doc.getDocumentElement().getElementsByTagName("teiHeader").item(0)
     		if (teiHeader == null) {
     			// previously the "null" string was truncated, which raised a StringIndexOutOfBoundsException
     			println " ** no teiHeader element found in $teiHeaderTemplateFile"
     			return null
     		}

     		// NOTE(review): the concatenation presumably makes Groovy serialize the DOM element
     		// with a leading XML declaration - confirm; the prefix is only removed when present
     		String content = ""+teiHeader
     		if (content.startsWith(XML_DECLARATION)) {
     			content = content.substring(XML_DECLARATION.length())
     		}
     		return content
     	}

     	/**
     	 * Writes the value into a node selected by an XPath: an element is replaced
     	 * by a same-named element holding the value as text and a copy of its
     	 * attributes, an attribute or a text node gets the value as node value,
     	 * any other node type is left untouched.
     	 */
     	private void patchExistingNode(def doc, def node, String info, String value) {
     		switch (node.getNodeType()) {
     			case org.w3c.dom.Node.ELEMENT_NODE:
     				if (debug > 1) println "   patching: $info with "+value
     				def newChild = doc.createElement(node.getTagName())
     				newChild.appendChild(doc.createTextNode(value))
     				def attributes = node.getAttributes();
     				for (int i = 0 ; i < attributes.getLength() ; i++) { // copy attributes
     					def attr = attributes.item(i)
     					newChild.setAttribute(attr.getNodeName(), attr.getNodeValue())
     				}
     				node.getParentNode().replaceChild(newChild, node)
     				break;
     			case org.w3c.dom.Node.ATTRIBUTE_NODE:
     				if (debug > 1) println "   patching attribute: $info with "+value
     				node.setNodeValue(value)
     				break;
     			case org.w3c.dom.Node.TEXT_NODE:
     				if (debug > 1) println "   patching text: $info with "+value
     				node.setNodeValue(value)
     				break;
     			default:
     				break
     		}
     	}

     	/**
     	 * Creates the element named by the last step of the XPath under the node
     	 * selected by the preceding steps. A trailing "/@name" step stores the value
     	 * in that attribute instead of the element text; a "[@a="x",@b="y"]" predicate
     	 * on the last step is turned into attributes of the new element.
     	 *
     	 * @return false when the XPath has no parent step, when the parent is not
     	 *         found or when the last step has no element name
     	 */
     	private boolean createMissingNode(XPathResult xpathProcessor, String xpath, String value) {
     		String attribute_xpath = null;
     		int attributeStart = xpath.lastIndexOf("/@")
     		if (attributeStart > 0) {
     			attribute_xpath = xpath.substring(attributeStart+2)
     			xpath = xpath.substring(0, attributeStart)
     		}

     		int lastSlash = xpath.lastIndexOf("/")
     		if (lastSlash <= 0) {
     			return false // no parent step: substring() would fail or the parent XPath would be empty
     		}
     		String parent_xpath = xpath.substring(0, lastSlash)
     		String element_xpath = xpath.substring(lastSlash+1)

     		def parent_expr = xpathProcessor.xpath.compile(parent_xpath);
     		def parent = parent_expr.evaluate(xpathProcessor.doc.getDocumentElement(), XPathConstants.NODE);
     		if (parent == null) {
     			return false
     		}

     		// the prefix is searched before the predicate only, so that a ':' in a predicate value is not taken for it;
     		// a step without prefix is accepted (indexOf returns -1 and the whole step is the name)
     		int predicateStart = element_xpath.indexOf("[")
     		String step = (predicateStart < 0) ? element_xpath : element_xpath.substring(0, predicateStart)
     		String nodename = step.substring(step.indexOf(":")+1)
     		if (nodename.length() == 0) {
     			return false
     		}

     		def attributes = [:]
     		if (predicateStart >= 0 && element_xpath.endsWith("]")) {
     			String attributesString = element_xpath.substring(predicateStart+1, element_xpath.length()-1)
     			for (String attributeString : attributesString.split(",")) {
     				def split = attributeString.trim().split("=", 2)
     				if (split.length < 2 || split[0].length() < 2 || split[1].length() < 2) {
     					println " ** predicate '$attributeString' ignored in '$xpath'"
     					continue
     				}
     				attributes[split[0].substring(1)] = split[1].substring(1, split[1].length()-1) // remove @ and remove ""
     			}
     		}

     		def newChild = xpathProcessor.doc.createElement(nodename)
     		for (String attributeName : attributes.keySet()) {
     			newChild.setAttribute(attributeName, attributes[attributeName])
     		}
     		if (attribute_xpath != null) {
     			newChild.setAttribute(attribute_xpath, value)
     		} else {
     			newChild.appendChild(xpathProcessor.doc.createTextNode(value))
     		}
     		if (debug > 1) println "  creating node $parent_xpath / $nodename[$attributes] : $attribute_xpath = $value"
     		parent.appendChild(newChild)
     		return true
     	}

     	/**
     	 * Replaces the teiHeader of an XML-TXM file.
     	 *
     	 * The result is first written to "tmp_&lt;file name&gt;" in the system temporary
     	 * directory; the original file is then moved to "copy_&lt;file name&gt;" in the same
     	 * directory and the result takes its place. If the result cannot be put in
     	 * place, the original file is moved back.
     	 *
     	 * @param txmFile the XML-TXM file to edit
     	 * @param xmlteiHeaderContent the serialized teiHeader written in place of the current one
     	 * @return true if the file has been replaced, false otherwise
     	 */
     	def replaceHeader(File txmFile, String xmlteiHeaderContent) {
     		println " editing: $txmFile..."

     		StaxIdentityParser sparser = new StaxIdentityParser(txmFile) {
     			// true while the parser is inside the teiHeader element that is being dropped
     			boolean start = false;

     			public void processStartElement() {
     				if (localname == "teiHeader") {
     					start = true;
     					if (debug > 2) println "   replacing teiHeader"
     					if (debug > 3) println "       with $xmlteiHeaderContent"
     					output.write(xmlteiHeaderContent.getBytes(Charset.forName("UTF-8")));  // REPLACE CONTENT !
     				}
     				if (!start) {
     					super.processStartElement();
     				}
     			}

     			public void processEndElement() {
     				if (!start) {
     					super.processEndElement();
     				}
     				if (localname == "teiHeader") {
     					start = false;
     					if (debug > 2) println "   replace done"
     				}
     			}

     			public void processCharacters() {
     				if (!start) {
     					super.processCharacters();
     				}
     			}

     			public void processComment() {
     				if (!start) {
     					super.processComment();
     				}
     			}
     		}

     		// system temporary directory instead of a directory that only exists on one developer machine
     		File tmpDirectory = new File(System.getProperty("java.io.tmpdir"));
     		File outfile = new File(tmpDirectory, "tmp_"+txmFile.getName())
     		File copyFile = new File(tmpDirectory, "copy_"+txmFile.getName())

     		if (!sparser.process(outfile)) {
     			println " -> FAIL see $outfile"
     			return false;
     		}

     		if (!moveFile(txmFile, copyFile)) {
     			println " -> FAIL could not make a copy of $txmFile in $copyFile"
     			println " -> see result in $outfile"
     			return false;
     		}

     		if (!moveFile(outfile, txmFile)) { // replace inputFile
     			println " -> FAIL could not replace $txmFile"
     			println " -> see result in $outfile"
     			// put the original back so that the corpus does not lose the text
     			if (!moveFile(copyFile, txmFile)) {
     				println " -> original file kept in $copyFile"
     			}
     			return false;
     		}

     		println " -> SUCCESS see $txmFile"
     		return true
     	}

     	/**
     	 * Moves a file, overwriting the target if it exists. Unlike File.renameTo,
     	 * Files.move also works when source and target are on different file systems.
     	 *
     	 * @return true if the file has been moved
     	 */
     	private boolean moveFile(File source, File target) {
     		try {
     			java.nio.file.Files.move(source.toPath(), target.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING)
     			return true
     		} catch (IOException e) {
     			if (debug > 0) println "   could not move $source to $target: $e"
     			return false
     		}
     	}
     }

     	println " compute measures..."
     	def data = new ComputeDemocratMeasureMetadata().getStats(corpus, xpathProperties)
     	Metadata2TEiHeader mthm = new Metadata2TEiHeader(debug);
     	Metadata2TEIHeader mthm = new Metadata2TEIHeader(debug);
     	for (def h : header) data[h] = csvReader.get(h)
     	println " creating teiHeader..."

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 2037