/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 3591

     package org.txm.macro.misc
     // STANDARD DECLARATIONS
     import groovy.xml.QName
     import java.nio.charset.Charset
     import java.text.DecimalFormat
     import org.txm.utils.xml.DomUtils;
     import org.txm.importer.ValidateXml;
     import groovy.util.XmlParser
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcp.swt.widget.parameters.*
     // BEGINNING OF PARAMETERS
     // Each @Field/@Option pair below is a macro parameter handed to ParametersDialog.open(this)
     @Field @Option(name="rootDir", usage="The directory contains the 'orig' directory which contains the html files", widget="Folder", required=false, def="/path")
     File rootDir = new File("");
     @Field @Option(name="encoding", usage="HTML encoding", widget="String", required=false, def="iso-8859-1")
     String encoding = "iso-8859-1" // charset used to read the source HTML files
     @Field @Option(name="debug", usage="show debug messages. Values = true, false", widget="String", required=false, def="false")
     def debug = "true" // held as a String here, turned into a boolean right after the dialog
     // Stop the macro when the parameters dialog returns false
     if (!ParametersDialog.open(this)) return;
     debug = ("true" == debug)
     // END OF PARAMETERS

     // Working directories, all children of rootDir
     String corpusName = rootDir.getName()
     File srcDir = new File(rootDir, "orig")
     File outDir = new File(rootDir, "xhtml")
     File outDir2 = new File(rootDir, corpusName)
     File rejected = new File(rootDir, "duplicates")
     File tmpDir = new File(rootDir, "tmp")
     if (!srcDir.exists()) {
     	println "STOP, srcDir does not exists $srcDir"
     	return;
     }
     // Recreate every output directory so each run starts empty
     [outDir, outDir2, rejected, tmpDir].each { dir ->
     	dir.deleteDir()
     	dir.mkdir()
     }

     // Collections kept for the (currently commented-out) tag/style/class survey
     def allTags = new HashSet<String>()
     def allStyles = new HashSet<String>()
     def allClasses = new HashSet<String>()

     // Conversion settings
     def newPrefix = "Numéro de document : "
     int itext = 1
     def formater = new DecimalFormat("0000")
     int LIMITDIFF = 10
     def metadatas = ["DocPublicationName", "DocHeader"]

     // Source files, sorted
     def files = []
     srcDir.eachFileMatch(~/.*\.html/) { files.add(it) }
     files = files.sort()

     // Duplicate detection state
     def done = new HashSet<String>()
     def ignored = []
     def allTitles = [:]
     def dones = [:]
     def ignoreds = []
     /**
      * Builds the flattened text of a node.
      *
      * A String is used as is; for anything else the texts of all its children()
      * are gathered recursively. Newlines are replaced by spaces and the result is
      * trimmed, then returned with a single leading space.
      *
      * @param node a String or an object exposing children()
      * @return the flattened text, always starting with one space
      */
     def getText(def node) {
     	StringBuilder buffer = new StringBuilder(" ")
     	if (node instanceof String) {
     		buffer.append(" ").append(node)
     	} else {
     		node.children().each { sub ->
     			buffer.append(" ").append(getText(sub))
     		}
     	}
     	String flattened = buffer.toString().replace("\n", " ").trim()
     	return " " + flattened
     }
     println "Nb of HTML files: "+files.size()
     // For each source HTML file: normalise it, convert it to XHTML with jsoup, then write
     // every article cell (td) of the result as its own XML file. An article whose signature
     // (title, plus length when it differs too much) was already seen goes to the
     // "rejected" directory instead of the corpus directory.
     for (File htmlFile : files) {
     	println "Processing file $htmlFile"
     	// Copy the source as UTF-8, dropping empty paragraphs and adding a space after <br>
     	File tmpHTML = new File(tmpDir, htmlFile.getName())
     	tmpHTML.withWriter("UTF-8") { writer ->
     		String txt = htmlFile.getText(encoding)
     		txt = txt.replaceAll("<p></p>", " ");
     		txt = txt.replaceAll("<p> </p>", " ");
     		txt = txt.replaceAll("<br>", "<br> ");
     		writer.write(txt)
     	}
     	String name = htmlFile.getName()
     	name = name.substring(0, name.lastIndexOf("."));
     	File xhtmlFile = new File(outDir, name+".xhtml")
     	xhtmlFile.withWriter("UTF-8") { out ->
     		// Parse the local UTF-8 copy: Jsoup.connect() only builds an HTTP connection,
     		// which has neither charset() nor outerHtml()
     		def doc = org.jsoup.Jsoup.parse(tmpHTML, "UTF-8");
     		println "current charset: "+doc.charset()
     		doc.charset(Charset.forName("UTF-8"))
     		println "current charset: "+doc.charset()
     		out.println(doc.outerHtml())
     	}
     	if (ValidateXml.test(xhtmlFile)) {
     		def root = new XmlParser().parse(xhtmlFile)
     		def tables = root.body.table.tbody.tr.td
     		if (tables.size() == 0) tables = root.body.table.tr.td
     		//println "Nb of txt : "+tables.size()
     		for (def text : tables) {
     			String sign = ""
     			// parentheses are required: "! text instanceof X" negates text before the type test
     			if (!(text instanceof groovy.util.Node)) { println "NOT NODE: "+text; continue; }
     			def textMetadatas = [:]
     			for (String metadata : metadatas) {
     				textMetadatas[metadata] = ""
     			}
     			// drop nested tables
     			for (def subtable : text.table) text.remove(subtable)
     			// NOTE(review): text.table is empty once the sub-tables are removed, so this loop does nothing
     			for (def p : text.table.p) p.addChild(" ")
     			// mark the highlighted search expressions as <w expFound="y">
     			for (def child : text.span) {
     				if ("color:red; font-weight:bold".equals(child.@style)) {
     					if (debug) println "Found bold: "+child
     					child.replaceNode { node -> w(expFound: "y", child.text())}
     				}
     			}
     			def startIgnoringText = false
     			def ichar = 0
     			String title = "";
     			def ignoredText = ""
     			def children = text.children()
     			for (int idx = 0 ; idx < children.size() ; idx++) {
     				def child  = children[idx]
     				if (debug) println "child: $child"
     				if (startIgnoringText) {
     					// after the first <b>, every child is removed from the text and its content kept aside
     					if (debug) println "Ignoring text : "+ignoredText
     					if (child instanceof String) ignoredText += child
     					else ignoredText += child.text()
     					text.children().remove(idx);
     					idx-- // stay on the same index: the list shrank
     					continue; // next child
     				}
     				if (child instanceof String) {
     					ichar += child.length()
     				} else {
     					ichar += child.text().length()
     					def nn = child.name()
     					// the name is a QName for namespaced documents, otherwise used as is
     					try {
     						nn = nn.getLocalPart()
     					} catch (MissingMethodException ignoredException) {
     						// no getLocalPart(): keep the name unchanged
     					}
     					switch (nn) {
     						case "br": break;
     						case "span":
     							if (debug) println "Found span $child"
     							String classV = child.@class
     							String style = child.@style
     							if (classV != null) {
     								if (metadatas.contains(classV)) {
     									// metadata spans become attributes of the text and leave the content
     									textMetadatas[classV] = (textMetadatas[classV] + " " + child.text()).trim()
     									text.remove(child);
     									idx--
     								} else if ("TitreArticleVisu" == classV) {
     									title += getText(child)
     									child.name = new QName(child.name().getNamespaceURI(), "head", child.name().getPrefix())
     								} else {
     									println "UNKNOWED CLASS: "+classV + " = "+child.text().trim().length();
     								}
     							} else if (style != null) {
     								if ("color:red; font-weight:bold".equals(style)) {
     									// highlighted keyword: nothing to do here
     								} else {
     									println "UNKNOWED STYLE: '"+style+"' = "+child.text().trim();
     								}
     							} else {
     								println "UNKNOWED SPAN: "+child.text().trim();
     							}
     							break;
     						case "a": break
     						case "w": break;
     						case "b":
     							// the first <b> starts the part that is removed from the text
     							startIgnoringText = true;
     							text.remove(child);
     							idx--
     							break;
     						case "i": break;
     						case "font":
     							if (debug) println "Found font $child"
     							String style = child.@style
     							if ("font-style:italic;" == style) {
     								if (debug) println "ITALIC: "+getText(child).trim();
     								child.replaceNode { node -> i(getText(child))}
     							} else if ("font-weight:bold;" == style) {
     								// compare with the style: a bare String literal is always true
     								if (debug) println "BOLD: "+getText(child).trim();
     								child.replaceNode { node -> b(getText(child))}
     							} else {
     								println "FSTYLE: '"+style+"' = "+getText(child).trim();
     							}
     							break;
     						default: println child.name()
     					}
     				}
     			}
     			//rename td to text
     			text.name = new QName(text.name().getNamespaceURI(), "text", text.name().getPrefix())
     			//Write metadatas
     			for (String metadata : metadatas) {
     				text.attributes().put(metadata, textMetadatas[metadata])
     			}
     			// get document number from the removed part of the text
     			ignoredText = ignoredText.replaceAll("\n", " ")
     			int iNo = ignoredText.indexOf(newPrefix);
     			if (iNo >= 0) {
     				String no = ignoredText.substring(iNo+newPrefix.length()).trim()
     				text.attributes().put("idnews", no)
     				// characters 5 to 12 of the number are read as yyyymmdd
     				if (no.length() >= 13) {
     					text.attributes().put("date", no.substring(5,9)+"-"+no.substring(9,11)+"-"+no.substring(11,13))
     					text.attributes().put("yyyymmdd", no.substring(5,13))
     					text.attributes().put("yyyymm", no.substring(5,11))
     					text.attributes().put("yyyy", no.substring(5,9))
     					text.attributes().put("mm", no.substring(9,11))
     					text.attributes().put("dd", no.substring(11,13))
     				} else {
     					println "WARNING: document number too short to read a date: '"+no+"'"
     				}
     			}
     			// signature = title, plus the length when it differs too much from the first text with that title
     			sign += " "+title
     			if (allTitles[title] == null) allTitles[title] = ichar
     			if (Math.abs(allTitles[title] - ichar) > LIMITDIFF) {
     				sign += " "+ichar
     			}
     			File xmlFile;
     			if (done.contains(sign)) {
     				ignored << sign
     				xmlFile = new File(rejected, name+"_"+formater.format((itext++))+".xml")
     				ignoreds << xmlFile.getName()
     			} else {
     				done << sign;
     				xmlFile = new File(outDir2, name+"_"+formater.format((itext++))+".xml")
     				dones[sign] = xmlFile
     			}
     			// withPrintWriter closes the file once the text is printed
     			xmlFile.withPrintWriter("UTF-8") { writer ->
     				writer.println "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
     				new XmlNodePrinter(writer).print(text)
     			}
     		}
     	}
     }
     // Report the rejected duplicates: one entry per rejected text, with the kept file and the signature
     if (ignored.size() > 0) {
     	File ignoredFile = new File (rejected, "ignored.txt");
     	ignoredFile.withWriter("UTF-8") { writer ->
     		writer.println "TOTAL: "+ignored.size()
     		for (int i = 0 ; i < ignored.size() ; i++) {
     			def sign = ignored[i]
     			writer.println "\n**DUPLICATE\n "
     			writer.println "keeped="+dones[sign];
     			writer.println "rejected="+ignoreds[i];
     			writer.println "SIGN="+sign
     			writer.println "\n"
     		}
     	}
     	println "TOTAL IGNORED: "+ignored.size()
     }
     // itext starts at 1 and is incremented after each written text, so the count is itext - 1
     println "TOTAL TEXT: ${itext - 1}"
     // intermediate directories are only kept in debug mode
     if (!debug) {
     	outDir.deleteDir()
     	tmpDir.deleteDir()
     }

     package org.txm.macro.misc
     // STANDARD DECLARATIONS
     import groovy.xml.QName
     import java.nio.charset.Charset
     import java.text.DecimalFormat
     import org.txm.utils.xml.DomUtils;
     import org.txm.importer.ValidateXml;
     import groovy.util.XmlParser
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcp.swt.widget.parameters.*
     // BEGINNING OF PARAMETERS
     // Each @Field/@Option pair below is a macro parameter handed to ParametersDialog.open(this)
     @Field @Option(name="rootDir", usage="The directory contains the 'orig' directory which contains the html files", widget="Folder", required=false, def="/path")
     File rootDir = new File("");
     @Field @Option(name="encoding", usage="HTML encoding", widget="String", required=false, def="iso-8859-1")
     String encoding = "iso-8859-1" // charset used to read the source HTML files
     @Field @Option(name="debug", usage="show debug messages. Values = true, false", widget="String", required=false, def="false")
     def debug = "true" // held as a String here, turned into a boolean right after the dialog
     // Stop the macro when the parameters dialog returns false
     if (!ParametersDialog.open(this)) return;
     debug = ("true" == debug)
     // END OF PARAMETERS

     // Working directories, all children of rootDir
     String corpusName = rootDir.getName()
     File srcDir = new File(rootDir, "orig")
     File outDir = new File(rootDir, "xhtml")
     File outDir2 = new File(rootDir, corpusName)
     File rejected = new File(rootDir, "duplicates")
     File tmpDir = new File(rootDir, "tmp")
     if (!srcDir.exists()) {
     	println "STOP, srcDir does not exists $srcDir"
     	return;
     }
     // Recreate every output directory so each run starts empty
     [outDir, outDir2, rejected, tmpDir].each { dir ->
     	dir.deleteDir()
     	dir.mkdir()
     }

     // Collections kept for the (currently commented-out) tag/style/class survey
     def allTags = new HashSet<String>()
     def allStyles = new HashSet<String>()
     def allClasses = new HashSet<String>()

     // Conversion settings
     def newPrefix = "Numéro de document : "
     int itext = 1
     def formater = new DecimalFormat("0000")
     int LIMITDIFF = 10
     def metadatas = ["DocPublicationName", "DocHeader"]

     // Source files, sorted
     def files = []
     srcDir.eachFileMatch(~/.*\.html/) { files.add(it) }
     files = files.sort()

     // Duplicate detection state
     def done = new HashSet<String>()
     def ignored = []
     def allTitles = [:]
     def dones = [:]
     def ignoreds = []
     /**
      * Builds the flattened text of a node.
      *
      * A String is used as is; for anything else the texts of all its children()
      * are gathered recursively. Newlines are replaced by spaces and the result is
      * trimmed, then returned with a single leading space.
      *
      * @param node a String or an object exposing children()
      * @return the flattened text, always starting with one space
      */
     def getText(def node) {
     	StringBuilder buffer = new StringBuilder(" ")
     	if (node instanceof String) {
     		buffer.append(" ").append(node)
     	} else {
     		node.children().each { sub ->
     			buffer.append(" ").append(getText(sub))
     		}
     	}
     	String flattened = buffer.toString().replace("\n", " ").trim()
     	return " " + flattened
     }
     println "Nb of HTML files: "+files.size()
     // For each source HTML file: normalise it, convert it to XHTML with jsoup, then write
     // every article cell (td) of the result as its own XML file. An article whose signature
     // (title, plus length when it differs too much) was already seen goes to the
     // "rejected" directory instead of the corpus directory.
     for (File htmlFile : files) {
     	println "Processing file $htmlFile"
     	// Copy the source as UTF-8, dropping empty paragraphs and adding a space after <br>
     	File tmpHTML = new File(tmpDir, htmlFile.getName())
     	tmpHTML.withWriter("UTF-8") { writer ->
     		String txt = htmlFile.getText(encoding)
     		txt = txt.replaceAll("<p></p>", " ");
     		txt = txt.replaceAll("<p> </p>", " ");
     		txt = txt.replaceAll("<br>", "<br> ");
     		writer.write(txt)
     	}
     	String name = htmlFile.getName()
     	name = name.substring(0, name.lastIndexOf("."));
     	File xhtmlFile = new File(outDir, name+".xhtml")
     	xhtmlFile.withWriter("UTF-8") { out ->
     		// Parse the local UTF-8 copy: Jsoup.connect() only builds an HTTP connection,
     		// which has neither charset() nor outerHtml()
     		def doc = org.jsoup.Jsoup.parse(tmpHTML, "UTF-8");
     		println "current charset: "+doc.charset()
     		doc.charset(Charset.forName("UTF-8"))
     		println "current charset: "+doc.charset()
     		out.println(doc.outerHtml())
     	}
     	if (ValidateXml.test(xhtmlFile)) {
     		def root = new XmlParser().parse(xhtmlFile)
     		def tables = root.body.table.tbody.tr.td
     		if (tables.size() == 0) tables = root.body.table.tr.td
     		//println "Nb of txt : "+tables.size()
     		for (def text : tables) {
     			String sign = ""
     			// parentheses are required: "! text instanceof X" negates text before the type test
     			if (!(text instanceof groovy.util.Node)) { println "NOT NODE: "+text; continue; }
     			def textMetadatas = [:]
     			for (String metadata : metadatas) {
     				textMetadatas[metadata] = ""
     			}
     			// drop nested tables
     			for (def subtable : text.table) text.remove(subtable)
     			// NOTE(review): text.table is empty once the sub-tables are removed, so this loop does nothing
     			for (def p : text.table.p) p.addChild(" ")
     			// mark the highlighted search expressions as <w expFound="y">
     			for (def child : text.span) {
     				if ("color:red; font-weight:bold".equals(child.@style)) {
     					if (debug) println "Found bold: "+child
     					child.replaceNode { node -> w(expFound: "y", child.text())}
     				}
     			}
     			def startIgnoringText = false
     			def ichar = 0
     			String title = "";
     			def ignoredText = ""
     			def children = text.children()
     			for (int idx = 0 ; idx < children.size() ; idx++) {
     				def child  = children[idx]
     				if (debug) println "child: $child"
     				if (startIgnoringText) {
     					// after the first <b>, every child is removed from the text and its content kept aside
     					if (debug) println "Ignoring text : "+ignoredText
     					if (child instanceof String) ignoredText += child
     					else ignoredText += child.text()
     					text.children().remove(idx);
     					idx-- // stay on the same index: the list shrank
     					continue; // next child
     				}
     				if (child instanceof String) {
     					ichar += child.length()
     				} else {
     					ichar += child.text().length()
     					def nn = child.name()
     					// the name is a QName for namespaced documents, otherwise used as is
     					try {
     						nn = nn.getLocalPart()
     					} catch (MissingMethodException ignoredException) {
     						// no getLocalPart(): keep the name unchanged
     					}
     					switch (nn) {
     						case "br": break;
     						case "span":
     							if (debug) println "Found span $child"
     							String classV = child.@class
     							String style = child.@style
     							if (classV != null) {
     								if (metadatas.contains(classV)) {
     									// metadata spans become attributes of the text and leave the content
     									textMetadatas[classV] = (textMetadatas[classV] + " " + child.text()).trim()
     									text.remove(child);
     									idx--
     								} else if ("TitreArticleVisu" == classV) {
     									title += getText(child)
     									child.name = new QName(child.name().getNamespaceURI(), "head", child.name().getPrefix())
     								} else {
     									println "UNKNOWED CLASS: "+classV + " = "+child.text().trim().length();
     								}
     							} else if (style != null) {
     								if ("color:red; font-weight:bold".equals(style)) {
     									// highlighted keyword: nothing to do here
     								} else {
     									println "UNKNOWED STYLE: '"+style+"' = "+child.text().trim();
     								}
     							} else {
     								println "UNKNOWED SPAN: "+child.text().trim();
     							}
     							break;
     						case "a": break
     						case "w": break;
     						case "b":
     							// the first <b> starts the part that is removed from the text
     							startIgnoringText = true;
     							text.remove(child);
     							idx--
     							break;
     						case "i": break;
     						case "font":
     							if (debug) println "Found font $child"
     							String style = child.@style
     							if ("font-style:italic;" == style) {
     								if (debug) println "ITALIC: "+getText(child).trim();
     								child.replaceNode { node -> i(getText(child))}
     							} else if ("font-weight:bold;" == style) {
     								// compare with the style: a bare String literal is always true
     								if (debug) println "BOLD: "+getText(child).trim();
     								child.replaceNode { node -> b(getText(child))}
     							} else {
     								println "FSTYLE: '"+style+"' = "+getText(child).trim();
     							}
     							break;
     						default: println child.name()
     					}
     				}
     			}
     			//rename td to text
     			text.name = new QName(text.name().getNamespaceURI(), "text", text.name().getPrefix())
     			//Write metadatas
     			for (String metadata : metadatas) {
     				text.attributes().put(metadata, textMetadatas[metadata])
     			}
     			// get document number from the removed part of the text
     			ignoredText = ignoredText.replaceAll("\n", " ")
     			int iNo = ignoredText.indexOf(newPrefix);
     			if (iNo >= 0) {
     				String no = ignoredText.substring(iNo+newPrefix.length()).trim()
     				text.attributes().put("idnews", no)
     				// characters 5 to 12 of the number are read as yyyymmdd
     				if (no.length() >= 13) {
     					text.attributes().put("date", no.substring(5,9)+"-"+no.substring(9,11)+"-"+no.substring(11,13))
     					text.attributes().put("yyyymmdd", no.substring(5,13))
     					text.attributes().put("yyyymm", no.substring(5,11))
     					text.attributes().put("yyyy", no.substring(5,9))
     					text.attributes().put("mm", no.substring(9,11))
     					text.attributes().put("dd", no.substring(11,13))
     				} else {
     					println "WARNING: document number too short to read a date: '"+no+"'"
     				}
     			}
     			// signature = title, plus the length when it differs too much from the first text with that title
     			sign += " "+title
     			if (allTitles[title] == null) allTitles[title] = ichar
     			if (Math.abs(allTitles[title] - ichar) > LIMITDIFF) {
     				sign += " "+ichar
     			}
     			File xmlFile;
     			if (done.contains(sign)) {
     				ignored << sign
     				xmlFile = new File(rejected, name+"_"+formater.format((itext++))+".xml")
     				ignoreds << xmlFile.getName()
     			} else {
     				done << sign;
     				xmlFile = new File(outDir2, name+"_"+formater.format((itext++))+".xml")
     				dones[sign] = xmlFile
     			}
     			// withPrintWriter closes the file once the text is printed
     			xmlFile.withPrintWriter("UTF-8") { writer ->
     				writer.println "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
     				new XmlNodePrinter(writer).print(text)
     			}
     		}
     	}
     }
     // Report the rejected duplicates: one entry per rejected text, with the kept file and the signature
     if (ignored.size() > 0) {
     	File ignoredFile = new File (rejected, "ignored.txt");
     	ignoredFile.withWriter("UTF-8") { writer ->
     		writer.println "TOTAL: "+ignored.size()
     		for (int i = 0 ; i < ignored.size() ; i++) {
     			def sign = ignored[i]
     			writer.println "\n**DUPLICATE\n "
     			writer.println "keeped="+dones[sign];
     			writer.println "rejected="+ignoreds[i];
     			writer.println "SIGN="+sign
     			writer.println "\n"
     		}
     	}
     	println "TOTAL IGNORED: "+ignored.size()
     }
     // itext starts at 1 and is incremented after each written text, so the count is itext - 1
     println "TOTAL TEXT: ${itext - 1}"
     // intermediate directories are only kept in debug mode
     if (!debug) {
     	outDir.deleteDir()
     	tmpDir.deleteDir()
     }

     // STANDARD DECLARATIONS
     package org.txm.macro.hyperlink
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcp.swt.widget.parameters.*
     import org.txm.searchengine.cqp.corpus.*
     import org.txm.edition.rcp.handlers.OpenEdition
     import org.txm.edition.rcp.editors.RGBA
     /**
      * The macro use the "stringArgs" Groovy Binding to work. The format is: parameter=value + TAB + parameter2=value2 ...
+     *
      * Parameters needed:
      * - corpus: corpus id
      * - text: text id to display
      * - page: page id to display
      * - editions: optional edition ids to display ("default" is used if not set)
      * - wordids: optional word ids to highlight and focus
+     *
      * This macro can be called from within TXM editions:
+     *
      * <a onclick="txmcommand('id', 'org.txm.rcp.commands.ExecuteMacro', 'org.txm.rcp.command.parameter.file', 'org/txm/macro/EditionHyperlinkMacro.groovy', 'args' , 'corpus=VOEUX    text=0002    page=3    editions=default')">Open Edition with text+page</a>.
+     *
      * <a onclick="txmcommand('id', 'org.txm.rcp.commands.ExecuteMacro', 'org.txm.rcp.command.parameter.file', 'org/txm/macro/EditionHyperlinkMacro.groovy', 'args' , 'corpus=VOEUX    text=0002    wordids=w_0002_6    editions=default')">Open Edition with text+wordid</a>.
+     *
      * <a onclick="txmcommand('id', 'org.txm.rcp.commands.ExecuteMacro', 'org.txm.rcp.command.parameter.file', 'org/txm/macro/EditionHyperlinkMacro.groovy', 'args' , 'corpus=VOEUX    text=0002    wordids=w_0002_6,w_0002_7,w_0002_8,w_0002_9    editions=default')">Open Edition with text+wordids</a>.
      */
     // Opens an edition on the text/page/words described by the "stringArgs" binding
     if (stringArgs == null) {
     	println "** Error: this macro must be called from an edition hyperlink"
     	return
     }
     // stringArgs holds parameter=value pairs separated by TAB characters
     def hash = [:]
     for (def param : stringArgs.split("\t")) {
     	def split = param.split("=", 2)
     	if (split.length < 2) {
     		// no '=' in this chunk: reading split[1] would be out of bounds
     		println "** Warning: ignoring malformed parameter '$param' (expected parameter=value)"
     		continue
     	}
     	hash[split[0]] = split[1]
     }
     // These stay undeclared so the Runnable below can read them
     corpus = hash["corpus"]
     text = hash["text"]
     page = hash["page"]
     editions = hash["editions"]
     if (editions != null) editions = editions.split(",") as List
     wordids = hash["wordids"]
     if (wordids != null) wordids = wordids.split(",") as List
     println "corpus=$corpus editions=$editions text=$text page=$page wordids=$wordids"
     // replace the corpus id by the corpus itself
     corpus = CorpusManager.getCorpusManager().getCorpora()[corpus]
     if (corpus == null) {
     	println "** Error: no corpus found with id '"+hash["corpus"]+"'"
     	return
     }
     monitor.syncExec(new Runnable() {
     	public void run() {
     		editor = OpenEdition.openEdition(corpus, editions)
     		if (wordids != null && wordids.size() > 0) {
     			// word ids given: go to the first one and highlight them all
     			try {
     				editor.backToText(corpus.getProject().getText(text), wordids[0])
     				editor.removeHighlightWords()
     				editor.addHighlightWordsById(new RGBA(249, 208, 208), wordids)
     				editor.updateWordStyles()
     			} catch(Exception e) { e.printStackTrace()}
     		} else {
     			// no word ids: go to the requested text and page
     			editor.goToText(text)
     			editor.goToPage(page)
     		}
     	}
     });

     // Copyright © 2020-2022 ENS de Lyon, CNRS, University of Franche-Comté
     // @author mdecorde
     // @author sheiden
     // STANDARD DECLARATIONS
     package org.txm.macro.export
     import org.txm.searchengine.cqp.CQPSearchEngine
     import org.txm.searchengine.cqp.corpus.*
     import org.txm.searchengine.cqp.corpus.query.CQLQuery
     import org.txm.Toolbox
     import org.txm.utils.i18n.LangFormater;
     import org.apache.commons.lang.StringUtils;
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcp.swt.widget.parameters.*
     // Exports each text of the selected corpus to "<text id>.txt" in outputDirectory,
     // writing the values of one word property, optionally one word or one sentence per line.
     if (!(corpusViewSelection instanceof CQPCorpus)) {
     	println "Please select a corpus"
     	return
     }
     // PARAMETERS
     @Field @Option(name="outputDirectory", usage="results output directory", widget="Folder", required=true, def="")
     File outputDirectory
     @Field @Option(name="wordProperty", usage="word property to export", widget="String", required=true, def="word")
     def wordProperty
     @Field @Option(name="oneWordPerLine", usage="output one word per line", widget="Boolean", required=false, def="false")
     def oneWordPerLine
     @Field @Option(name="oneSentencePerLine", usage="output one sentence per line", widget="Boolean", required=false, def="true")
     def oneSentencePerLine
     @Field @Option(name="sentenceStructureName", usage="name of the structure encoding sentences", widget="String", required=false, def="")
     def sentenceStructureName
     if (!ParametersDialog.open(this)) return
     // BEGINNING
     if (!outputDirectory.exists()) outputDirectory.mkdirs()
     def corpus = corpusViewSelection
     def corpusName = corpus.getName()
     def CQI = CQPSearchEngine.getCqiClient()
     if (wordProperty == null || !(wordProperty.length() > 0)) {
     	println "** Please provide a word property name in parameter 'wordProperty', for example 'word'. Aborting..."
     	return 1
     }
     if (oneSentencePerLine && (sentenceStructureName == null || !(sentenceStructureName.length() > 0))) {
     	println "** Please provide a name for the structure encoding sentences in parameter 'sentenceStructureName', or uncheck parameter 'oneSentencePerLine'. Aborting..."
     	return 1
     }
     if (oneSentencePerLine) {
     	lineSeparatorStructure = corpus.getStructuralUnit(sentenceStructureName)
     	if (lineSeparatorStructure == null) {
     		println "** No $sentenceStructureName structure in the $corpus corpus. Aborting..."
     		return 1
     	}
     	// Positions of the last word of each sentence, stored one by one in a set:
     	// Arrays.asList() on a primitive array may wrap the whole array as a single element,
     	// and a set keeps contains() cheap for every word tested below
     	breaks_pos = new HashSet(corpus.query(new CQLQuery("[]</"+sentenceStructureName+">"),"test", false).getEnds() as List)
     }
     println "Exporting $corpus text content to $outputDirectory..."
     def wordPropertyI = corpus.getProperty(wordProperty)
     if (wordPropertyI == null) {
     	println "** No '$wordProperty' word property in the $corpus corpus. Aborting..."
     	return 1
     }
     // text ids, in the same order as the text boundaries
     def textidProperty = corpus.getStructuralUnit("text").getProperty("id")
     def textStartBoundaries = corpus.getTextStartLimits()
     def textEndBoundaries = corpus.getTextEndLimits()
     int[] struct_pos = CQI.cpos2Struc(textidProperty.getQualifiedName(), textStartBoundaries)
     String[] textids =  CQI.struc2Str(textidProperty.getQualifiedName(), struct_pos)
     if (textStartBoundaries.size() == 1) {
     	println "1 text"
     } else {
     	println ""+textStartBoundaries.size()+" texts"
     }
     for (int i = 0 ; i < textStartBoundaries.size() ; i++) {
     	int start = textStartBoundaries[i]
     	int end = textEndBoundaries[i]
     	File txtFile = new File(outputDirectory, textids[i]+".txt")
     	print "."
     	// every position of the text, from start to end inclusive
     	int[] positions = (start..end) as int[]
     	int[] idx = CQI.cpos2Id(wordPropertyI.getQualifiedName(), positions)
     	def words = CQI.id2Str(wordPropertyI.getQualifiedName(), idx)
     	// withWriter closes the file even when writing fails
     	txtFile.withWriter("UTF-8") { writer ->
     		// writes the pending words, one per line or joined into one formatted line
     		def writeWords = { pending ->
     			if (oneWordPerLine) {
     				pending.each { word -> writer.println word }
     			} else {
     				writer.println LangFormater.format(pending.join(" "), corpus.getLang())
     			}
     		}
     		def tmp = []
     		for (int j = 0 ; j < positions.length ; j++) {
     			tmp << words[j]
     			// end of sentence reached: flush the words gathered so far
     			if (oneSentencePerLine && breaks_pos.contains(positions[j])) {
     				writeWords(tmp)
     				tmp = []
     			}
     		}
     		// words left after the last sentence break (or the whole text when not splitting)
     		if (tmp.size() > 0) writeWords(tmp)
     	}
     }
     println "\nDone, result saved in "+outputDirectory.getAbsolutePath()

     // Copyright © 2020-2022 ENS de Lyon, CNRS, University of Franche-Comté
     // @author mdecorde
     // @author sheiden
     // STANDARD DECLARATIONS
     package org.txm.macro.export
     import org.txm.searchengine.cqp.CQPSearchEngine
     import org.txm.searchengine.cqp.corpus.*
     import org.txm.searchengine.cqp.corpus.query.CQLQuery
     import org.txm.Toolbox
     import org.txm.utils.i18n.LangFormater;
     import org.apache.commons.lang.StringUtils;
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcp.swt.widget.parameters.*
     // Exports each text of the selected corpus to "<text id>.txt" in outputDirectory,
     // writing the values of one word property, optionally one word or one sentence per line.
     if (!(corpusViewSelection instanceof CQPCorpus)) {
     	println "Please select a corpus"
     	return
     }
     // PARAMETERS
     @Field @Option(name="outputDirectory", usage="results output directory", widget="Folder", required=true, def="")
     File outputDirectory
     @Field @Option(name="wordProperty", usage="word property to export", widget="String", required=true, def="word")
     def wordProperty
     @Field @Option(name="oneWordPerLine", usage="output one word per line", widget="Boolean", required=false, def="false")
     def oneWordPerLine
     @Field @Option(name="oneSentencePerLine", usage="output one sentence per line", widget="Boolean", required=false, def="true")
     def oneSentencePerLine
     @Field @Option(name="sentenceStructureName", usage="name of the structure encoding sentences", widget="String", required=false, def="")
     def sentenceStructureName
     if (!ParametersDialog.open(this)) return
     // BEGINNING
     if (!outputDirectory.exists()) outputDirectory.mkdirs()
     def corpus = corpusViewSelection
     def corpusName = corpus.getName()
     def CQI = CQPSearchEngine.getCqiClient()
     if (wordProperty == null || !(wordProperty.length() > 0)) {
     	println "** Please provide a word property name in parameter 'wordProperty', for example 'word'. Aborting..."
     	return 1
     }
     if (oneSentencePerLine && (sentenceStructureName == null || !(sentenceStructureName.length() > 0))) {
     	println "** Please provide a name for the structure encoding sentences in parameter 'sentenceStructureName', or uncheck parameter 'oneSentencePerLine'. Aborting..."
     	return 1
     }
     if (oneSentencePerLine) {
     	lineSeparatorStructure = corpus.getStructuralUnit(sentenceStructureName)
     	if (lineSeparatorStructure == null) {
     		println "** No $sentenceStructureName structure in the $corpus corpus. Aborting..."
     		return 1
     	}
     	// Positions of the last word of each sentence, stored one by one in a set:
     	// Arrays.asList() on a primitive array may wrap the whole array as a single element,
     	// and a set keeps contains() cheap for every word tested below
     	breaks_pos = new HashSet(corpus.query(new CQLQuery("[]</"+sentenceStructureName+">"),"test", false).getEnds() as List)
     }
     println "Exporting $corpus text content to $outputDirectory..."
     def wordPropertyI = corpus.getProperty(wordProperty)
     if (wordPropertyI == null) {
     	println "** No '$wordProperty' word property in the $corpus corpus. Aborting..."
     	return 1
     }
     // text ids, in the same order as the text boundaries
     def textidProperty = corpus.getStructuralUnit("text").getProperty("id")
     def textStartBoundaries = corpus.getTextStartLimits()
     def textEndBoundaries = corpus.getTextEndLimits()
     int[] struct_pos = CQI.cpos2Struc(textidProperty.getQualifiedName(), textStartBoundaries)
     String[] textids =  CQI.struc2Str(textidProperty.getQualifiedName(), struct_pos)
     if (textStartBoundaries.size() == 1) {
     	println "1 text"
     } else {
     	println ""+textStartBoundaries.size()+" texts"
     }
     for (int i = 0 ; i < textStartBoundaries.size() ; i++) {
     	int start = textStartBoundaries[i]
     	int end = textEndBoundaries[i]
     	File txtFile = new File(outputDirectory, textids[i]+".txt")
     	print "."
     	// every position of the text, from start to end inclusive
     	int[] positions = (start..end) as int[]
     	int[] idx = CQI.cpos2Id(wordPropertyI.getQualifiedName(), positions)
     	def words = CQI.id2Str(wordPropertyI.getQualifiedName(), idx)
     	// withWriter closes the file even when writing fails
     	txtFile.withWriter("UTF-8") { writer ->
     		// writes the pending words, one per line or joined into one formatted line
     		def writeWords = { pending ->
     			if (oneWordPerLine) {
     				pending.each { word -> writer.println word }
     			} else {
     				writer.println LangFormater.format(pending.join(" "), corpus.getLang())
     			}
     		}
     		def tmp = []
     		for (int j = 0 ; j < positions.length ; j++) {
     			tmp << words[j]
     			// end of sentence reached: flush the words gathered so far
     			if (oneSentencePerLine && breaks_pos.contains(positions[j])) {
     				writeWords(tmp)
     				tmp = []
     			}
     		}
     		// words left after the last sentence break (or the whole text when not splitting)
     		if (tmp.size() > 0) writeWords(tmp)
     	}
     }
     println "\nDone, result saved in "+outputDirectory.getAbsolutePath()

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 3591