/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 2051

     import org.txm.core.engines.EngineType;
     import org.txm.core.messages.TXMCoreMessages;
     import org.txm.core.results.TXMResult;
     import org.txm.importer.cwb.PatchCwbRegistry;
     import org.txm.libs.cqp.CQPLibPreferences;
     import org.txm.objects.CorpusBuild;
     import org.txm.objects.Match;
     import org.txm.objects.Project;
     import org.txm.searchengine.core.EmptySelection;
     import org.txm.searchengine.core.Query;
     import org.txm.searchengine.core.SearchEngine;
-...
     import org.txm.searchengine.cqp.corpus.query.CQLQuery;
     import org.txm.utils.OSDetector;
     import org.txm.utils.Sh;
     import org.txm.utils.io.IOUtils;
     import org.txm.utils.logger.Log;
     public class CQPSearchEngine extends SearchEngine {

     // Copyright © 2017 ENS de Lyon, CNRS, University of Franche-Comté
     // Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
     // @author sheiden
     package org.txm.macro.commands
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcpapplication.swt.widget.parameters.*
     import org.txm.searchengine.cqp.CQPSearchEngine
     import org.txm.Toolbox
     import org.txm.macro.cqp.CQPUtils
     import org.txm.searchengine.cqp.ICqiClient
     import org.odftoolkit.simple.SpreadsheetDocument
     import org.odftoolkit.odfdom.doc.table.OdfTable
     import org.odftoolkit.odfdom.doc.table.OdfTableRow
     import org.odftoolkit.odfdom.doc.table.OdfTableCell
     import org.odftoolkit.simple.table.Table
     import org.odftoolkit.simple.Document
     import org.odftoolkit.simple.style.Border
     import org.odftoolkit.simple.style.Font
     import org.odftoolkit.simple.style.StyleTypeDefinitions
     import org.odftoolkit.simple.style.StyleTypeDefinitions.FontStyle
     import org.odftoolkit.simple.table.Row
     import org.odftoolkit.simple.table.Cell
     import org.odftoolkit.simple.style.StyleTypeDefinitions.CellBordersType
     import org.odftoolkit.simple.style.StyleTypeDefinitions.HorizontalAlignmentType
     // BEGINNING OF PARAMETERS
     // Each @Field/@Option pair declares one macro parameter; the values are
     // filled in by ParametersDialog.open(this) below.
     @Field @Option(name="inputFile", usage="input sequence list file (one sequence per line)", widget="File", required=true, def="")
     def inputFile
     @Field @Option(name="inputIsCQL", usage="input sequences are CQL queries", widget="Boolean", required=true, def="false")
     def inputIsCQL
     @Field @Option(name="inputWordProperty", usage="implicit word property to use", widget="String", required=true, def="word")
     String inputWordProperty
     @Field @Option(name="outputFile", usage="results output File (TSV format)", widget="File", required=true, def="")
     def outputFile
     @Field @Option(name="odsOutputFormat", usage="use ODS as output format", widget="Boolean", required=true, def="false")
     def odsOutputFormat
     @Field @Option(name="outputWordProperty", usage="output word property to use", widget="String", required=true, def="word")
     String outputWordProperty
     @Field @Option(name="groupByQuery", usage="provide frequency of each CQL match values or just the total frequency of each CQL match", widget="Boolean", required=true, def="false")
     def groupByQuery
     // Open the parameters input dialog box
     if (!ParametersDialog.open(this)) return
     // END OF PARAMETERS
     // Script class name, used as a prefix in every diagnostic message.
     def scriptName = this.class.getSimpleName()
     // The input file must exist, be a regular file and be readable.
     if (!(inputFile && inputFile.isFile() && inputFile.canRead() )) {
     	println "** $scriptName: impossible to read input file. Aborting."
     	return false
+    }
     // corpusEngine, utils and corpora are assigned without 'def', so they are
     // script binding variables shared with the rest of the script.
     corpusEngine = CQPSearchEngine.getCqiClient()
     utils = new CQPUtils()
     corpora = utils.getCorpora(this)
     // Nothing to query if no corpus could be resolved.
     if ((corpora == null) || corpora.size() == 0) {
     	println "** $scriptName: please select a corpus in the Corpus view or provide a corpus name. Aborting."
     	return false
+    }
     /**
      * Returns a name identifying the machine running the macro.
      *
      * Looks at the COMPUTERNAME environment variable first, then HOSTNAME,
      * and finally asks the network stack for the local host name.
      * An environment variable that is set but empty is ignored, so the next
      * source is tried instead of returning an empty name.
      *
      * @return the machine name
      */
     def getComputerName() {
     	// 'def' keeps the environment map local instead of leaking it into the script binding
     	def env = System.getenv()
     	return env.get("COMPUTERNAME") ?: env.get("HOSTNAME") ?: InetAddress.getLocalHost().getHostName()
     }
     // Base name of the requested output file, without its extension: the real
     // extension (.ods or .tsv) is imposed below according to odsOutputFormat.
     def fullName = outputFile.getName()
     def i = fullName.lastIndexOf(".")
     def name = (i == -1) ? fullName : fullName.substring(0, i)
     if (odsOutputFormat) {
     	// The ODS writer is an optional library: check that it is available before using it.
     	try {
     		Class.forName("org.odftoolkit.simple.SpreadsheetDocument")
     	} catch (e) {
     		println "** $scriptName: the simple-odf-0.8.1-incubating.jar file is needed in the \$HOME/TXM/scripts/lib directory to write in ODS format. Please uncheck the odsOutputFormat option. Aborting."
     		return false
     	}
     	outputFile = new File(outputFile.getParentFile(), name+".ods")
     	ods = SpreadsheetDocument.newSpreadsheetDocument()
     	table = ods.getTableByName("Sheet1")
     	// Bold font used by setStringCellBold for header cells.
     	font = new Font("Arial", FontStyle.BOLD, 11)
     	// Second sheet recording the parameters and context of this run.
     	properties = Table.newTable(ods)
     	properties.setTableName("Properties")
     	setStringCellBold(properties, 0, 0, "Name")
     	setStringCellBold(properties, 0, 1, "Value")
     	line = 1
     	setStringCell2(properties, line++, 0, "date", new Date().format("dd/MM/yyyy"))
     	setStringCell2(properties, line++, 0, "time", new Date().format("HH'h'mm"))
     	setStringCell2(properties, line++, 0, "machine", getComputerName())
     	setStringCell2(properties, line++, 0, "user", System.getProperty("user.name"))
     	setStringCell2(properties, line++, 0, "macro", scriptName)
     	setStringCell2(properties, line++, 0, "corpora", corpora.collect { it.getName() }.join(","))
     	setStringCell2(properties, line++, 0, "inputFile", inputFile.getAbsolutePath())
     	setStringCell2(properties, line++, 0, "inputIsCQL", inputIsCQL.toString())
     	setStringCell2(properties, line++, 0, "inputWordProperty", inputWordProperty)
     	setStringCell2(properties, line++, 0, "outputFile", outputFile.getAbsolutePath())
     	setStringCell2(properties, line++, 0, "odsOutputFormat", odsOutputFormat.toString())
     	setStringCell2(properties, line++, 0, "outputWordProperty", outputWordProperty)
     	setStringCell2(properties, line++, 0, "groupByQuery", groupByQuery.toString())
     } else {
     	outputFile = new File(outputFile.getParentFile(), name+".tsv")
     	// Explicit UTF-8: FileWriter would use the platform default charset and
     	// make the TSV content depend on the machine that produced it.
     	output = new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8")
     }
     /**
      * Writes one result row with three columns: corpus, query string, frequency.
      *
      * @param table     sheet to write to (must provide getCellByPosition(col, row))
      * @param row       row index
      * @param corpus    corpus name, written in column 0
      * @param string    query string, written in column 1
      * @param frequency number of matches, written in column 2 as a numeric value
      */
     def setRow(table, row, corpus, string, frequency) {
     	// No intermediate variable: the original stored each cell in an undeclared
     	// 'cell', i.e. in the script binding shared with the main script.
     	table.getCellByPosition(0, row).setStringValue(corpus)
     	table.getCellByPosition(1, row).setStringValue(string)
     	table.getCellByPosition(2, row).setDoubleValue(frequency)
     }
     /**
      * Writes one result row with four columns: corpus, query, matched value, frequency.
      * The matched value column is left empty when the frequency is 0.
      *
      * @param table     sheet to write to (must provide getCellByPosition(col, row))
      * @param row       row index
      * @param corpus    corpus name, written in column 0
      * @param string1   query string, written in column 1
      * @param string2   matched value, written in column 2 (blank when frequency is 0)
      * @param frequency number of matches, written in column 3 as a numeric value
      */
     def setRow2(table, row, corpus, string1, string2, frequency) {
     	// Cells are addressed directly so nothing is written to the shared script binding.
     	table.getCellByPosition(0, row).setStringValue(corpus)
     	table.getCellByPosition(1, row).setStringValue(string1)
     	table.getCellByPosition(2, row).setStringValue((frequency == 0) ? '' : string2)
     	table.getCellByPosition(3, row).setDoubleValue(frequency)
     }
     /**
      * Sets the text of one cell.
      *
      * @param table  sheet to write to (must provide getCellByPosition(col, row))
      * @param row    row index
      * @param col    column index
      * @param string text to store
      * @return the cell that was written
      */
     def setStringCell(table, row, col, string) {
     	// local variable: the original assigned an undeclared 'cell' in the script binding
     	def target = table.getCellByPosition(col, row)
     	target.setStringValue(string)
     	return target
     }
     /**
      * Sets the text of one cell and applies the script-level 'font' to it.
      * 'font' is a script binding variable that the main script assigns only
      * in ODS mode, before calling this helper.
      *
      * @param table  sheet to write to (must provide getCellByPosition(col, row))
      * @param row    row index
      * @param col    column index
      * @param string text to store
      * @return the cell that was written
      */
     def setStringCellBold(table, row, col, string) {
     	// local variable: the original assigned an undeclared 'cell' in the script binding
     	def target = table.getCellByPosition(col, row)
     	target.setStringValue(string)
     	target.setFont(font)
     	return target
     }
     /**
      * Writes a pair of texts in two adjacent cells of the same row.
      *
      * @param table   sheet to write to (must provide getCellByPosition(col, row))
      * @param row     row index
      * @param col     column index of the first cell; the second one goes to col+1
      * @param string1 text of the first cell
      * @param string2 text of the second cell
      * @return the second cell (the one at col+1)
      */
     def setStringCell2(table, row, col, string1, string2) {
     	table.getCellByPosition(col, row).setStringValue(string1)
     	// local variable: the original assigned an undeclared 'cell' in the script binding
     	def second = table.getCellByPosition(col+1, row)
     	second.setStringValue(string2)
     	return second
     }
     // Header row. The layout is "corpus, query, f" when results are grouped by
     // query, and "corpus, query, values, f" otherwise.
     def queryHeader = inputIsCQL ? "CQL" : outputWordProperty+" query"
     def valuesHeader = outputWordProperty+" values"
     if (odsOutputFormat) {
     	setStringCellBold(table, 0, 0, "corpus")
     	setStringCellBold(table, 0, 1, queryHeader)
     	def frequencyColumn = 2
     	if (!groupByQuery) {
     		setStringCellBold(table, 0, frequencyColumn++, valuesHeader)
     	}
     	// the frequency header is right-aligned
     	cell = setStringCellBold(table, 0, frequencyColumn, "f")
     	cell.setHorizontalAlignment(HorizontalAlignmentType.RIGHT)
     } else {
     	def columns = ["corpus", queryHeader]
     	if (!groupByQuery) {
     		columns << valuesHeader
     	}
     	columns << "f"
     	output.println columns.join("\t")
     }
     // Main loop: run every input sequence against every selected corpus and
     // write the frequencies to the ODS sheet or to the TSV writer.
     // 'row' is the next ODS row to fill; it starts at 1 because row 0 receives
     // the column headers earlier in the script.
     row = 1
     corpora.each { corpus ->
     	corpusName = corpus.getName()
     	// seqN numbers the input lines; it is used to name the query result (NIndex0, NIndex1, ...)
     	seqN = 0
     	inputFile.eachLine() { line ->
     		cql = ""
     		subCorpus = "NIndex"+seqN++
     		// empty lines are skipped (but still consume a sequence number)
     		if (line.length() > 0) {
     			if (inputIsCQL) {
     				// the line is used verbatim as a CQL query
     				cql = line
     			} else {
     				// each blank-separated word becomes one [property="word"] token of the query
     				cql = "[$inputWordProperty=\""+line.replaceAll("\\p{Blank}+", "\"] [$inputWordProperty=\"",)+"\"]"
+    			}
     			// run the query, storing its result under the name held in subCorpus, then read its size
     			corpusEngine.cqpQuery(corpusName, "$subCorpus", cql)
     			nmatches = corpusEngine.subCorpusSize("$corpusName:$subCorpus")
     			if (nmatches == 0) {
     				// no match: still report the query, with a frequency of 0
     				if (groupByQuery) {
     					if (odsOutputFormat) {
     						setRow(table, row++, corpusName, line, 0)
     					} else {
     						output.println corpusName+"\t"+line+"\t0"
+    					}
     				} else {
     					if (odsOutputFormat) {
     						setRow2(table, row++, corpusName, line, line, 0)
     					} else {
     						// the values column is left empty
     						output.println corpusName+"\t"+line+"\t\t0"
+    					}
+    				}
     			} else {
     				if (groupByQuery) {
     					// only the total number of matches of the query is reported
     					if (odsOutputFormat) {
     						setRow(table, row++, corpusName, line, nmatches)
     					} else {
     						output.println corpusName+"\t"+line+"\t$nmatches"
+    					}
     				} else {
     					// one output row per distinct matched string:
     					// fetch the start and end position of every match...
     					starts = corpusEngine.dumpSubCorpus("$corpusName:$subCorpus", ICqiClient.CQI_CONST_FIELD_MATCH, 0, nmatches-1)
     					ends = corpusEngine.dumpSubCorpus("$corpusName:$subCorpus", ICqiClient.CQI_CONST_FIELD_MATCHEND, 0, nmatches-1)
     					// ...pair them, turn each [start, end] span into the space-joined
     					// outputWordProperty values of its positions...
     					[starts, ends].transpose().collect {
     						corpusEngine.cpos2Str("$corpusName.$outputWordProperty", (it[0]..it[1]) as int[]).join(' ')
     					// ...then count identical strings and list them by decreasing frequency
     					}.countBy { it }.sort { -it.value }.each {
     						if (odsOutputFormat) {
     							setRow2(table, row++, corpusName, line, it.key, it.value)
     						} else {
     							output.println corpusName+"\t"+line+"\t"+it.key+"\t"+it.value
+    						}
+    					}
+    				}
+    			}
+    		}
+    	}
+    }
     // Flush the results: save the spreadsheet or close the TSV writer.
     if (odsOutputFormat) {
     	ods.save(outputFile)
     } else {
     	output.close()
+    }
     return true

     // Copyright © 2017 ENS de Lyon, CNRS, University of Franche-Comté
     // Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
     // @author sheiden
     package org.txm.macro.cqp
     import org.txm.searchengine.cqp.corpus.*
     /**
      * Looks up a corpus by name.
      *
      * The corpora returned by the CorpusManager are searched first; if none
      * has this name, the direct subcorpora of each of them are searched, in
      * order, and the first one found is returned.
      *
      * @param name the corpus or subcorpus name
      * @return the matching corpus or subcorpus, or null when there is none
      */
     def getCorpusByName(name) {
     	def corpora = CorpusManager.getCorpusManager().getCorpora()
     	def direct = corpora.find { it.getName() == name }
     	if (direct) return direct
     	for (parent in corpora) {
     		def child = parent.getSubcorpora().find { it.getName() == name }
     		if (child) return child
     	}
     	return null
     }
     /**
      * Resolves the corpora a macro should work on.
      *
      * Resolution order:
      *  1. the "corpus" entry of the script's "args" binding variable (one name),
      *  2. the "corpora" entry of "args" (comma-separated names),
      *  3. the current selection of the Corpus view.
      *
      * Names that do not match any corpus are reported and left out of the
      * result, so that callers never receive null elements.
      *
      * @param script the calling script (gives access to its binding and to the
      *               corpusViewSelection / corpusViewSelections variables)
      * @return the list of corpora (possibly empty), or null when the Corpus
      *         view selection is not a corpus or subcorpus
      */
     def getCorpora(def script) {
     	def isCorpus = { it instanceof Subcorpus || it instanceof MainCorpus }
     	// resolve one name, warning about names that match nothing
     	def resolve = { String corpusName ->
     		def found = getCorpusByName(corpusName)
     		if (found == null) println "** corpus '"+corpusName+"' not found."
     		return found
     	}
     	def args = script.binding.variables["args"]
     	if (args) {
     		if (args["corpus"]) {
     			return [resolve(args["corpus"].toString().trim())].findAll { it != null }
     		}
     		if (args["corpora"]) {
     			return args["corpora"].tokenize(',').collect { resolve(it.trim()) }.findAll { it != null }
     		}
     	}
     	// no usable argument: fall back on the Corpus view selection
     	// ('null instanceof X' is false, so a missing selection is rejected too)
     	if (!isCorpus(script.corpusViewSelection)) {
     		return null
     	}
     	return script.corpusViewSelections.findAll(isCorpus)
     }
     /**
      * Proposes a subcorpus name of the form "CORPUS<n>" that is not yet used
      * by any direct subcorpus of the given corpus, using the smallest n >= 1.
      *
      * @param name name of the parent corpus, looked up among the CorpusManager corpora
      * @return the free name, or null (after printing a message) when the parent corpus is unknown
      */
     def newTestSubCorpusName(name) {
     	def parent = CorpusManager.getCorpusManager().getCorpora().find { it.getName() == name }
     	if (!parent) {
     		println "** corpus '"+name+"' not found."
     		return null
     	}
     	def existing = parent.getSubcorpora()
     	def index = 1
     	// advance until no subcorpus carries the candidate name
     	while (existing.find { it.getName() == "CORPUS"+index }) {
     		index++
     	}
     	return "CORPUS"+index
     }
     /**
      * Builds a CQL query selecting exactly the given corpus positions.
      *
      * Input is a comma-separated list of positions, each either a single
      * position ("6") or an inclusive interval ("2-3"), e.g. "0,1,2-3,4-5,6".
      * Singletons are matched by the labelled pattern n1:[]; intervals of
      * length L (= end - start) by n<L+1>:[][]{L}. The global constraint after
      * "::" lists the allowed start positions for each label, e.g.
      *   "n1:[]|n2:[][]{1}::n1=0|n1=1|n1=6|n2=2|n2=4"
      *
      * @param matches the position list as a String
      * @return the CQL query as a String ("::" alone when the list is empty)
      */
     def positions2cql(matches) {
     	def singletons = []
     	def intervals = [:] // interval length -> start positions, in order of first appearance
     	matches.tokenize(',').each { interval ->
     		if (interval.indexOf('-') != -1) { // start-end
     			// toInteger() parses the digits. The former 'as int[]' coercion went
     			// through char codes: "2" became 50 and "12" raised a cast exception.
     			def (start, end) = interval.tokenize('-').collect { it.toInteger() }
     			def length = end - start
     			if (intervals[length]) {
     				intervals[length] << start
     			} else {
     				intervals[length] = [start]
     			}
     		} else { // singleton
     			singletons << interval.toInteger()
     		}
     	}
     	def patterns = []    // left-hand side: one labelled pattern per distinct length
     	def constraints = [] // right-hand side: one "label=position" per match
     	if (singletons.size() > 0) {
     		patterns << "n1:[]"
     		singletons.each { constraints << "n1="+it }
     	}
     	intervals.each { length, positions ->
     		def label = "n"+(length+1)
     		patterns << label+":[][]{"+length+"}"
     		positions.each { constraints << label+"="+it }
     	}
     	return patterns.join("|")+"::"+constraints.join("|")
     }
     /**
      * Renders the matches of a corpus as a position list.
      *
      * Each match is written "start-end" when start < end and "start" otherwise.
      * At most maxMatches matches are listed; "..." is appended when there are more.
      *
      * @param corpus     object whose getMatches() returns the matches (each with getStart()/getEnd())
      * @param maxMatches maximum number of matches to list
      * @param pretty     when true, items are separated by ", " instead of ","
      * @return the position list, or "<empty>" when the corpus has no match
      */
     def corpus2positions(def corpus, def maxMatches, def pretty = false) {
     	def matches = corpus.getMatches()
     	if (matches.size() == 0) {
     		return "<empty>"
     	}
     	def separator = pretty ? ", " : ","
     	def res = matches.take(maxMatches).collect { m ->
     		(m.getStart() < m.getEnd()) ? m.getStart()+"-"+m.getEnd() : ""+m.getStart()
     	}.join(separator)
     	// signal that the list was truncated
     	return (matches.size() > maxMatches) ? res+"..." : res
     }
     // Script body: only executed when this utility script is run directly.
     // It lists the names of the methods declared by this class (synthetic ones
     // and the script's own main/run excluded) and tells the user to import them.
     // NOTE(review): the message names "CQPUtils" and then recommends importing
     // "org.txm.macro.cqp.CQPUtilsMacro" — confirm which class name is right.
     methods = this.getClass().declaredMethods.findAll { !it.synthetic }.name.sort().unique()
     methods.removeAll(['main', 'run'])
     println "** CQPUtils is not meant to be called directly.\nPlease import its methods ${methods}\nwith the following declaration:\nimport org.txm.macro.cqp.CQPUtilsMacro"
     /*
     	  ___           ___           ___           ___           ___           ___
     	 /\  \         /\__\         /\__\         /\  \         |\__\         /\  \
     	/::\  \       /::|  |       /::|  |       /::\  \        |:|  |       /::\  \
        /:/\:\  \     /:|:|  |      /:|:|  |      /:/\:\  \       |:|  |      /:/\:\  \
       /::\~\:\  \   /:/|:|  |__   /:/|:|  |__   /::\~\:\  \      |:|__|__   /::\~\:\  \
      /:/\:\ \:\__\ /:/ |:| /\__\ /:/ |:| /\__\ /:/\:\ \:\__\ ____/::::\__\ /:/\:\ \:\__\
      \/__\:\/:/  / \/__|:|/:/  / \/__|:|/:/  / \:\~\:\ \/__/ \::::/~~/~    \:\~\:\ \/__/
     	  \::/  /      |:/:/  /      |:/:/  /   \:\ \:\__\    ~~|:|~~|      \:\ \:\__\
     	  /:/  /       |::/  /       |::/  /     \:\ \/__/      |:|  |       \:\ \/__/
     	 /:/  /        /:/  /        /:/  /       \:\__\        |:|  |        \:\__\
     	 \/__/         \/__/         \/__/         \/__/         \|__|         \/__/
 1 2 3 4 5 6 7 8 9
     + + + +
 1 2 3 4 5 6 7 8 9
     +0,+1,+2,+3
     n:[] :: n = 0|n = 1|n = 2|n = 3
     [ ] [ ]
 1 2 3 4 5 6 7 8 9
 -1,2-3
     n:[] []{1} :: n = 0
     n:[] []{1} :: n = 2
     [   ] +
 1 2 3 4 5 6 7 8 9
 -2,+3
     n:[] []{2} :: n = 0
     n:[] :: n = 3
      */
     /*
     Touching intervals mergers
     ==========================
     A) from http://stackoverflow.com/questions/31670849/merge-overlapping-intervals
     import java.util.*;
     public class Ideone
+    {
     	public static void main (String[] args) throws java.lang.Exception
+    	{
     		ArrayList<Interval> x = new ArrayList<>();
     		x.add(new Interval(1, 3));
     		x.add(new Interval(2, 6));
     		x.add(new Interval(8, 10));
     		x.add(new Interval(15, 18));
     		x.add(new Interval(17, 20));
     		x = merge(x);
     		for(Interval i : x)
+    		{
     			System.out.println(i.getStart() + " " + i.getEnd());
+    		}
+    	}
     	public static ArrayList<Interval> merge(ArrayList<Interval> intervals) {
     		if(intervals.size() == 0 || intervals.size() == 1)
     			return intervals;
     		Collections.sort(intervals, new IntervalComparator());
     		Interval first = intervals.get(0);
     		int start = first.getStart();
     		int end = first.getEnd();
     		ArrayList<Interval> result = new ArrayList<Interval>();
     		for (int i = 1; i < intervals.size(); i++) {
     			Interval current = intervals.get(i);
     			if (current.getStart() <= end) {
     				end = Math.max(current.getEnd(), end);
     			} else {
     				result.add(new Interval(start, end));
     				start = current.getStart();
     				end = current.getEnd();
+    			}
+    		}
     		result.add(new Interval(start, end));
     		return result;
+    	}
+    }
     class Interval
+    {
     	private int start;
     	private int end;
     	Interval() {
     		start = 0;
     		end = 0;
+    	}
     	Interval(int s, int e)
+    	{
     		start = s;
     		end = e;
+    	}
     	public int getStart() {
     		return start;
+    	}
     	public int getEnd() {
     		return end;
+    	}
+    }
     class IntervalComparator implements Comparator<Interval>
+    {
     	public int compare(Interval i1, Interval i2)
+    	{
     		return i1.getStart() - i2.getStart();
+    	}
+    }
     B) from http://www.programcreek.com/2012/12/leetcode-merge-intervals/
     public List<Interval> merge(List<Interval> intervals) {
     	List<Interval> result = new ArrayList<Interval>();
     	if(intervals==null||intervals.size()==0)
     		return result;
     	Collections.sort(intervals, new Comparator<Interval>(){
     		public int compare(Interval i1, Interval i2){
     			if(i1.start!=i2.start)
     				return i1.start-i2.start;
     			else
     				return i1.end-i2.end;
+    		}
     	});
     	Interval pre = intervals.get(0);
     	for(int i=0; i<intervals.size(); i++){
     		Interval curr = intervals.get(i);
     		if(curr.start>pre.end){
     			result.add(pre);
     			pre = curr;
     		}else{
     			Interval merged = new Interval(pre.start, Math.max(pre.end, curr.end));
     			pre = merged;
+    		}
+    	}
     	result.add(pre);
     	return result;
+    }
      */

     package org.txm.macro.text
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import java.nio.charset.Charset
     import org.txm.rcpapplication.swt.widget.parameters.*
     import org.txm.utils.*
     import javax.xml.stream.*
     import java.net.URL
     import org.apache.poi.ss.usermodel.*
     import org.apache.poi.hssf.usermodel.*
     import org.apache.poi.xssf.usermodel.*
     import org.apache.poi.ss.util.*
     // Macro parameters; values are filled in by ParametersDialog.open(this) below.
     @Field @Option(name="inputDirectory", usage="répertoire des fichiers Excel à traiter", widget="Folder", required=true, def="")
     def inputDirectory
     @Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
     def sheetName
     @Field @Option(name="rootTag", usage="Root tag name", widget="String", required=false, def="root")
     def rootTag
     @Field @Option(name="textTag", usage="Text tag name", widget="String", required=false, def="unit")
     def textTag
     @Field @Option(name="metadataColumnList", usage="metadataColumnList column list separated with comma", widget="String", required=false, def="meta1,meta2")
     def metadataColumnList
     @Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
     def dateColumnList
     @Field @Option(name="textColumnList", usage="textColumnList column list separated with comma", widget="String", required=false, def="textColumnList1,textColumnList2")
     def textColumnList
     if (!ParametersDialog.open(this)) return
     if (!inputDirectory.exists()) {
     	println "** ExcelDir2XML: no '"+inputDirectory.name+"' directory found. Aborting."
     	return false
     }
     if (!inputDirectory.canRead()) {
     	println "** ExcelDir2XML: '"+inputDirectory.name+"' directory not readable. Aborting."
     	return false
     }
     // Collect the .xlsx files of the directory. The dot is escaped so that only
     // names really ending with the ".xlsx" extension are selected (the former
     // pattern /.*xlsx/ accepted any name merely ending with the letters "xlsx").
     def f = []
     inputDirectory.eachFileMatch(~/.*\.xlsx/) { f << it }
     if (f.size() == 0) {
     	println "** ExcelDir2XML: no .xlsx file found. Aborting."
     	return false
     }
     try {
     	// Convert the files in name order by delegating each one to Excel2XMLMacro,
     	// forwarding this macro's parameters and the current TXM selection context.
     	f.sort { it.name }.each { inputFile ->
     		def res = gse.run(Excel2XMLMacro, ["args":[
     				"inputFile":inputFile,
     				"sheetName":sheetName,
     				"metadataColumnList":metadataColumnList,
     				"dateColumnList":dateColumnList,
     				"textColumnList":textColumnList,
     				"rootTag":rootTag,
     				"textTag":textTag,
     				"selection":selection,
     				"selections":selections,
     				"corpusViewSelection":corpusViewSelection,
     				"corpusViewSelections":corpusViewSelections,
     				"monitor":monitor]])
     		if (!res) println "** problem calling Excel2XMLMacro."
     	}
     } catch (Exception e) {
     	println "** ExcelDir2XML: unable to read input files. Aborting."
     	println e.getLocalizedMessage()
     	// printStackTrace() returns nothing: wrapping it in println only added a stray "null" line
     	e.printStackTrace()
     	return false
     }
     return true

     package org.txm.macro.text
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import java.nio.charset.Charset
     import org.txm.rcpapplication.swt.widget.parameters.*
     import org.txm.utils.*
     import javax.xml.stream.*
     import java.net.URL
     import java.text.SimpleDateFormat
     import org.apache.poi.ss.usermodel.*
     import org.apache.poi.hssf.usermodel.*
     import org.apache.poi.xssf.usermodel.*
     import org.apache.poi.ss.util.*
     // Turns a column header into a lower-case identifier: the text is passed
     // through AsciiUtils.convertNonAscii, then AsciiUtils.buildAttributeId,
     // and finally lower-cased. The result is used for XML element and attribute names.
     def stringToIndent = { str ->
     	def asciiText = org.txm.utils.AsciiUtils.convertNonAscii(str)
     	def identifier = org.txm.utils.AsciiUtils.buildAttributeId(asciiText)
     	return identifier.toLowerCase()
     }
     // Adapted from http://www.java-connect.com/apache-poi-tutorials/read-all-type-of-excel-cell-value-as-string-using-poi
     //
     // Returns the content of an Excel cell as a String, never null:
     //  - string cells: their text
     //  - date-formatted numeric cells: the date as dd/MM/yyyy
     //  - other numeric cells: integral values without decimal part ("42"),
     //    other values with their decimals ("3.14") — the original truncated
     //    every number to a long, silently turning 3.14 into "3"
     //  - boolean cells: "true" / "false"
     //  - blank cells, null cells and any other cell type: ""
     // All temporaries are local: the original assigned undeclared variables
     // (strCellValue, value, longValue) in the script binding.
     def getCellValueAsString = { cell ->
     	if (cell == null) return ""
     	switch (cell.getCellType()) {
     		case Cell.CELL_TYPE_STRING:
     			return cell.toString() ?: ""
     		case Cell.CELL_TYPE_NUMERIC:
     			if (DateUtil.isCellDateFormatted(cell)) {
     				return new SimpleDateFormat("dd/MM/yyyy").format(cell.getDateCellValue())
     			}
     			double number = cell.getNumericCellValue()
     			boolean integral = (number == Math.rint(number)) && !Double.isInfinite(number)
     			return integral ? String.valueOf((long) number) : String.valueOf(number)
     		case Cell.CELL_TYPE_BOOLEAN:
     			return String.valueOf(cell.getBooleanCellValue())
     		default: // blank and every unhandled cell type
     			return ""
     	}
     }
     // Macro parameters; values are filled in by ParametersDialog.open(this) below.
     @Field @Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="")
     File inputFile
     @Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
     def sheetName
     @Field @Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root")
     def rootTag
     @Field @Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit")
     def textTag
     @Field @Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2")
     def metadataColumnList
     @Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
     def dateColumnList
     @Field @Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2")
     def textColumnList
     // Stop silently when the parameters dialog reports failure.
     if (!ParametersDialog.open(this)) return
     // The input workbook must exist and be readable.
     if (!inputFile.exists()) {
     	println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting."
     	return false
+    }
     if (!inputFile.canRead()) {
     	println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting."
     	return false
+    }
     // Converts the selected sheet to an XML file written next to the input file
     // (same base name, .xml extension): one <pb/> and one textTag element per
     // data row, the metadata columns as attributes and as a <metadata> list,
     // and each text column as a child element.
     // Returns true on success, false after printing a message otherwise.
     //
     // The two streams are declared outside the try so that the finally clause
     // can release them when the conversion fails half-way.
     def writer = null
     def output = null
     try {
     	// The column lists arrive as comma-separated strings: turn them into trimmed lists.
     	metadataColumnList = metadataColumnList.split(",").collect { it.trim() }
     	dateColumnList = dateColumnList.split(",").collect { it.trim() }
     	textColumnList = textColumnList.split(",").collect { it.trim() }
     	textTag = textTag.trim()
     	rootTag = rootTag.trim()

     	wb = WorkbookFactory.create(inputFile)
     	if (sheetName == null || sheetName.length() == 0) {
     		ws = wb.getSheetAt(0)
     	} else {
     		ws = wb.getSheet(sheetName)
     		if (ws == null) {
     			println "** Excel2XML: no '"+sheetName+"' sheet found. Aborting."
     			return false
     		}
     	}
     	if (ws == null) {
     		println "** Excel2XML: no sheet found. Aborting."
     		return false
     	}

     	nRows = ws.getPhysicalNumberOfRows()
     	println nRows+" rows."

     	// The first row holds the column headers.
     	firstRow = ws.getRow(0)
     	colMax = (firstRow == null) ? 0 : (firstRow.getLastCellNum() as int)
     	if (colMax < 1) {
     		println "** Excel2XML: no header row found. Aborting."
     		return false
     	}
     	// half-open range: never degenerates into a reverse range
     	headers = (0..<colMax).collect { getCellValueAsString(firstRow.getCell(it)) }
     	println "Headers: $headers"
     	normalizedHeaders = headers.collect { stringToIndent(it) }

     	// Every requested metadata and text column must exist in the header row.
     	ok = true
     	metadataColumnList.each { m ->
     		if (!headers.contains(m)) {
     			println "** Excel2XML: missing metadataColumnList column: $m"
     			ok = false
     		}
     	}
     	textColumnList.each { t ->
     		if (!headers.contains(t)) {
     			println "** Excel2XML: missing textColumnList column: $t"
     			ok = false
     		}
     	}
     	if (!ok) { return false }

     	metadataColumnIndex = metadataColumnList.collect { headers.indexOf(it) }
     	dateColumnsIndex = dateColumnList.collect { headers.indexOf(it) }
     	textColumnIndex = textColumnList.collect { headers.indexOf(it) }
     	println "metadataColumnList = "+metadataColumnList
     	println "metadataColumnIndex = "+metadataColumnIndex
     	println "dateColumnList = "+dateColumnList
     	println "dateColumnsIndex = "+dateColumnsIndex
     	println "textColumnList = "+textColumnList
     	println "textColumnIndex = "+textColumnIndex

     	// Output file: same directory and base name as the input, .xml extension.
     	name = inputFile.getName()
     	idx = name.lastIndexOf(".")
     	if (idx > 0) name = name.substring(0, idx)
     	outputFile = new File(inputFile.getParentFile(), name+".xml")

     	factory = XMLOutputFactory.newInstance()
     	output = new FileOutputStream(outputFile)
     	writer = factory.createXMLStreamWriter(output, "UTF-8")
     	writer.writeStartDocument("UTF-8","1.0")
     	writer.writeCharacters("\n") // simple XML formatting
     	writer.writeStartElement(rootTag)
     	writer.writeCharacters("\n")

     	pb_n = 1
     	// Iterate up to the last row index and skip undefined rows. The former
     	// (1..nRows-1) range ran backwards (1, 0) on a header-only sheet and hit
     	// null rows whenever the sheet had gaps.
     	int lastRowIndex = ws.getLastRowNum()
     	for (int rowIndex = 1; rowIndex <= lastRowIndex; rowIndex++) {
     		def row = ws.getRow(rowIndex)
     		if (row == null) continue

     		writer.writeCharacters("  ")
     		writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
     		writer.writeAttribute("n", ""+pb_n++)
     		writer.writeCharacters("\n") // simple XML formatting
     		writer.writeCharacters("  ")

     		writer.writeStartElement(textTag)
     		metadataColumnIndex.each { colIndex -> // build an attribute for each metadata
     			def value = getCellValueAsString(row.getCell(colIndex)).replaceAll("\n", ";").trim()
     			writer.writeAttribute(normalizedHeaders[colIndex], value)
     			if (colIndex in dateColumnsIndex) { // also split date attributes in day+month+year attributes
     				def dateMatcher = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
     				if (dateMatcher.find()) {
     					writer.writeAttribute(normalizedHeaders[colIndex]+"jour", dateMatcher.group(1))
     					writer.writeAttribute(normalizedHeaders[colIndex]+"joursemaine", new java.text.SimpleDateFormat('EEEE').format(Date.parse("dd/MM/yyyy", dateMatcher.group(0))))
     					writer.writeAttribute(normalizedHeaders[colIndex]+"mois", dateMatcher.group(2))
     					writer.writeAttribute(normalizedHeaders[colIndex]+"annee", dateMatcher.group(3))
     				} else {
     					// an empty or malformed date no longer aborts the whole conversion
     					println "** Excel2XML: row "+(rowIndex+1)+", column '"+headers[colIndex]+"': no dd/MM/yyyy date found in '"+value+"'."
     				}
     			}
     		}
     		writer.writeCharacters("\n")

     		// Human-readable copy of the metadata, as a list.
     		writer.writeCharacters("    ")
     		writer.writeStartElement("metadata")
     		writer.writeStartElement("list")
     		writer.writeAttribute("type", "unordered")
     		writer.writeCharacters("\n")
     		metadataColumnIndex.each { colIndex ->
     			writer.writeStartElement("item")
     			writer.writeCharacters(headers[colIndex]+" : "+getCellValueAsString(row.getCell(colIndex)).replaceAll("\n", ";"))
     			writer.writeEndElement() // item
     			writer.writeCharacters("\n")
     		}
     		writer.writeCharacters("    ")
     		writer.writeEndElement() // list
     		writer.writeEndElement() // metadata
     		writer.writeCharacters("\n")

     		// One element per text column, named after the normalized header.
     		textColumnIndex.each { colIndex ->
     			writer.writeCharacters("    ")
     			writer.writeStartElement(normalizedHeaders[colIndex])
     			writer.writeStartElement("p")
     			writer.writeStartElement("head")
     			writer.writeStartElement("hi")
     			writer.writeCharacters(headers[colIndex]+" : ")
     			writer.writeEndElement() // hi
     			writer.writeEndElement() // head
     			def value = getCellValueAsString(row.getCell(colIndex))
     			// A cell made of "ABC:  text ;" items is split into one
     			// <descripteur type="ABC"> element per item; any other content is written as is.
     			if (value ==~ /(?s)^[A-Z]{3}:  [^;\n]+? +[;\n].*/) {
     				value.findAll( /(?s)[A-Z]{3}:  ([^;\n]+?) +[;\n]/ ).each { desc ->
     					writer.writeStartElement("descripteur")
     					def descMatcher = (desc =~ /(?s)([A-Z]{3}):  ([^;\n]+?) +[;\n]/)
     					writer.writeAttribute("type", descMatcher[0][1])
     					writer.writeCharacters(descMatcher[0][2])
     					writer.writeEndElement() // descripteur
     				}
     			} else {
     				writer.writeCharacters(value)
     			}
     			writer.writeEndElement() // p
     			writer.writeEndElement() // textColumn
     			writer.writeCharacters("\n")
     		}

     		writer.writeCharacters("  ")
     		writer.writeEndElement() // textTag
     		writer.writeCharacters("\n")
     	}
     	writer.writeEndElement() // rootTag
     	writer.writeCharacters("\n")

     	// Normal path: close explicitly so that a failing flush is reported below.
     	writer.close()
     	writer = null
     	output.close()
     	output = null
     	println "Result file: $outputFile"
     } catch (Exception e) {
     	println "** Excel2XML: unable to read input file. Aborting."
     	println e.getLocalizedMessage()
     	// printStackTrace() returns nothing: wrapping it in println only added a stray "null" line
     	e.printStackTrace()
     	return false
     } finally {
     	// Failure path only: whatever is still open is released quietly, the
     	// original error having already been reported.
     	if (writer != null) { try { writer.close() } catch (Exception ignored) { } }
     	if (output != null) { try { output.close() } catch (Exception ignored) { } }
     }
     return true

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 2051