/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

Révision 1543

     package org.txm.macro.conversion
     // STANDARD DECLARATIONS
     import groovy.xml.QName
     import java.text.DecimalFormat
     import org.txm.importer.DomUtils
     import org.txm.importer.ValidateXml
     import org.w3c.tidy.Tidy
     import groovy.util.XmlParser
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcpapplication.swt.widget.parameters.*
     import org.jsoup.Jsoup
     import org.jsoup.nodes.Document.OutputSettings.Syntax
     // BEGINNING OF PARAMETERS
     // Folder holding the HTML files exported from the Europress portal (required).
     @Field @Option(name="inputDir", usage="The directory containing the html files, to export from the Europress portal", widget="Folder", required=true, def="")
     def inputDir
     // Charset used to read the HTML files; defaults to iso-8859-1.
     @Field @Option(name="inputEncoding", usage="character encoding used in the HTML exported files", widget="String", required=false, def="iso-8859-1")
     String inputEncoding
     // Folder receiving the converted corpus (required).
     @Field @Option(name="outputDir", usage="The directory containing the result files, to import with the XTZ+CSV import module into TXM", widget="Folder", required=true, def="")
     def outputDir
     // Corpus name; the script later uses it as a sub-folder of outputDir and as the CSS file name.
     @Field @Option(name="corpusName", usage="corpus name", widget="String", required=true, def="")
     String corpusName
     // Separator written between the columns of metadata.csv; defaults to a comma.
     @Field @Option(name="columnSeparator",usage="", widget="String", required=false, def=",")
     def columnSeparator
     // Quote character wrapped around the text cells of metadata.csv; defaults to a double quote.
     @Field @Option(name="txtSeparator",usage="", widget="String", required=false, def="\"")
     def txtSeparator
     // Debug switch; per its usage text it should show debug messages and keep temporary results.
     @Field @Option(name="debug", usage="show debug messages and keep temporary results", widget="Boolean", required=false, def="false")
     def debug
     // Open the parameters input dialog box; stop the macro when open() returns false
     if (!ParametersDialog.open(this)) return
     // END OF PARAMETERS
     // Nothing can be converted without the input folder: report and abort.
     if (!inputDir.exists()) {
     	println "** inputDir does not exist: $inputDir, aborting."
     	return false
+    }
     xslposttokContent = """<?xml version="1.0"?>
     <xsl:stylesheet xmlns:edate="http://exslt.org/dates-and-times"
       xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:tei="http://www.tei-c.org/ns/1.0"
       xmlns:txm="http://textometrie.org/ns/1.0"
       exclude-result-prefixes="tei edate" xpath-default-namespace="http://www.tei-c.org/ns/1.0" version="2.0">
       <!--
     This software is dual-licensed:
 . Distributed under a Creative Commons Attribution-ShareAlike 3.0
     Unported License http://creativecommons.org/licenses/by-sa/3.0/
 . http://www.opensource.org/licenses/BSD-2-Clause
     All rights reserved.
     Redistribution and use in source and binary forms, with or without
     modification, are permitted provided that the following conditions are
     met:
     * Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.
     This software is provided by the copyright holders and contributors
     "as is" and any express or implied warranties, including, but not
     limited to, the implied warranties of merchantability and fitness for
     a particular purpose are disclaimed. In no event shall the copyright
     holder or contributors be liable for any direct, indirect, incidental,
     special, exemplary, or consequential damages (including, but not
     limited to, procurement of substitute goods or services; loss of use,
     data, or profits; or business interruption) however caused and on any
     theory of liability, whether in contract, strict liability, or tort
     (including negligence or otherwise) arising in any way out of the use
     of this software, even if advised of the possibility of such damage.
     This stylesheet adds a ref attribute to w elements that will be used for
     references in TXM concordances. Can be used with TXM XTZ import module.
     w ref is composed of :
     - docpublicationname
     - date
     Written by Alexei Lavrentiev, UMR 5317 IHRIM, 2017
     Serge Heiden, UMR 5317 IHRIM, 2018
       -->
       <xsl:output method="xml" encoding="utf-8" omit-xml-declaration="no"/>
       <!-- General patterns: all elements, attributes, comments and processing instructions are copied -->
       <xsl:template match="*">
             <xsl:copy>
               <xsl:apply-templates select="*|@*|processing-instruction()|comment()|text()"/>
             </xsl:copy>
       </xsl:template>
       <xsl:template match="*" mode="position"><xsl:value-of select="count(preceding-sibling::*)"/></xsl:template>
       <xsl:template match="@*|comment()|processing-instruction()">
         <xsl:copy/>
       </xsl:template>
       <xsl:template match="*:w">
         <xsl:variable name="ref">
     	<xsl:choose>
              <xsl:when test="ancestor::*:text[1]/@ref">
                <!-- <xsl:text>ref: </xsl:text> -->
     	   <xsl:value-of select="ancestor::*:text[1]/@ref"/>
              </xsl:when>
              <xsl:otherwise>
               <!-- <xsl:text>[NO docpublicationname]</xsl:text> -->
              </xsl:otherwise>
            </xsl:choose>
     	<xsl:choose>
              <xsl:when test="ancestor::*:text[1]/@docpublicationname">
                <!-- <xsl:text>docpublicationname: </xsl:text> -->
     	   <xsl:value-of select="ancestor::*:text[1]/@docpublicationname"/>
              </xsl:when>
              <xsl:otherwise>
               <!-- <xsl:text>[NO docpublicationname]</xsl:text> -->
              </xsl:otherwise>
            </xsl:choose>
     	<xsl:choose>
              <xsl:when test="ancestor::*:text[1]/@date">
                <!-- <xsl:text>date: </xsl:text> -->
     	   <xsl:value-of select="ancestor::*:text[1]/@date"/>
              </xsl:when>
              <xsl:otherwise>
               <!-- <xsl:text>[NO date]</xsl:text> -->
              </xsl:otherwise>
            </xsl:choose>
     <!--
           <xsl:if test="ancestor::*:text/@*:id and preceding::*:pb[1]/@n">
             <xsl:text>, </xsl:text>
           </xsl:if>
     -->
           <xsl:if test="ancestor::*:p[1]/@n">
             <xsl:text>§ </xsl:text>
             <xsl:value-of select="ancestor::*:p[1]/@n"/>
           </xsl:if>
     <!--
           <xsl:if test="preceding::*:pb[1]/@n">
             <xsl:text>p. </xsl:text>
             <xsl:value-of select="preceding::*:pb[1]/@n"/>
           </xsl:if>      <xsl:if test="(ancestor::*:text/@*:id or preceding::*:pb[1]/@n) and preceding::*:lb[1]/@n">
             <xsl:text>, </xsl:text>
           </xsl:if>
           <xsl:if test="preceding::*:lb[1]/@n">
             <xsl:text>l. </xsl:text>
             <xsl:value-of select="preceding::*:lb[1]/@n"/>
           </xsl:if>
     -->
         </xsl:variable>
             <xsl:copy>
               <xsl:apply-templates select="@*"/>
               <xsl:attribute name="ref"><xsl:value-of select="\$ref"/></xsl:attribute>
               <xsl:apply-templates select="*|processing-instruction()|comment()|text()"/>
             </xsl:copy>
       </xsl:template>
     </xsl:stylesheet>
     """
     cssContent = """/*
        Copyright © 2017 ENS de Lyon, CNRS, University of Franche-Comté
        Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
        @author cbourdot
        @author sheiden
        TXM default CSS 06-2017
     */
     .txmeditionpage {
     	font-size: 14px;
     	text-indent: none;
     	text-align: justify;
     	box-shadow: .3125em .3125em .625em 0 #888;
     	margin: 1.25em auto;
     	padding: 1.25em;
     	width: 400px;
     	min-height: 90%;
+    }
     .txmeditionpb {
     	text-align: center;
+    }
     .txmeditionpb::before {
     	content: "- ";
+    }
     .txmeditionpb::after {
     	content: " -";
+    }
     .txmlettrinep:first-letter {
         float: left;
         font-size: 6em;
         line-height: 1;
         margin-right: 0.2em;
+    }
     a {
     	color:#802520;
+    }
     h1 {
     	font-size: 20px;
     	font-variant: small-caps;
     	text-align: center;
     	color:#802520;
+    }
     h2 {
     	font-size: 18px;
     	font-variant: small-caps;
     	text-align: center;
     	color:#802520;
+    }
     h3 {
     	font-size: 16px;
     	font-variant: small-caps;
     	text-align: center;
     	color:#802520;
+    }
     p {
         	text-indent: 0.2cm;
     	text-align: justify;
         	text-justify: inter-word;
+      }
     img {
         margin: 10px 10px 10px 10px;
+    }
     td[rend="table-cell-align-right"] {
     	text-align: right;
+    }
     td[rend="table-cell-align-left"] {
     	text-align: left;
+    }
     td[rend="table-cell-align-center"] {
     	text-align: center;
+    }
     """
     // Wipes a directory if it exists, creates it again and hands it back.
     def recreate = { dir ->
     	dir.deleteDir()
     	dir.mkdir()
     	return dir
     }
     // The selected output folder is emptied first, then the corpus folder
     // and its working sub-folders are created inside it.
     recreate(outputDir)
     outputDir = recreate(new File(outputDir, corpusName))
     tmpDir = recreate(new File(outputDir, "tmp"))
     tmpXhtmlOutput = recreate(new File(outputDir, "xhtml"))
     duplicates = recreate(new File(outputDir, "duplicates"))
     // Write the post-tokenisation stylesheet and the edition CSS next to the
     // converted texts, in the "xsl/3-posttok" and "css" sub-folders of outputDir.
     xslDir = new File(outputDir, "xsl")
     xslDir.mkdir()
     xslposttokDir = new File(xslDir, "3-posttok")
     xslposttokDir.mkdir()
     xslposttokFile = new File(xslposttokDir, "txm-posttok-addRef-ref.xsl")
     cssDir = new File(outputDir, "css")
     cssDir.mkdir()
     cssFile = new File(cssDir, corpusName+".css")
     // Both resources contain non-ASCII characters. The stylesheet has no encoding
     // pseudo-attribute, so an XML parser reads it as UTF-8: writing it with the
     // platform default charset (what "file << text" does) can produce an invalid
     // file, hence the explicit charset.
     xslposttokFile.write(xslposttokContent, "UTF-8")
     cssFile.write(cssContent, "UTF-8")
     // metadata.csv receives one line per article; it is written in UTF-8.
     metadataFile = new File(outputDir, "metadata.csv")
     metadataWriter = metadataFile.newWriter("UTF-8")
     // Running article counter, formatted on 4 digits when building text ids.
     int itext = 0
     def formater = new DecimalFormat("0000")
     // HTML elements containing metadata content, with @class=metadataKeys
     def metadataKeys = ["DocPublicationName", "DocHeader", "titreArticle"]
     // HTML elements containing text content, with @class=textClass
     def textClass = "docOcurrContainer"
     // write metadata header: "id", then one lower-cased column per metadata key
     metadataWriter.print "id"
     metadataKeys.each { metadataWriter.print columnSeparator+it.toLowerCase() }
     // DocHeader substrings: extra columns derived from the DocHeader value
     metadataWriter.print columnSeparator+"rubrique"
     metadataWriter.print columnSeparator+"date"
     metadataWriter.print columnSeparator+"words"
     metadataWriter.print columnSeparator+"pages"
     metadataWriter.print columnSeparator+"textorder" // date
     metadataWriter.println ""
     // Collect the *.html / *.HTML files of inputDir, in sorted order.
     def files = []
     inputDir.eachFileMatch(~/.*\.(html|HTML)/){ htmlFile -> files << htmlFile}
     files = files.sort()
     // Bookkeeping used for duplicate detection and for the final report.
     // NOTE(review): 'done' is not referenced anywhere else in the visible script.
     def done = new HashSet<String>()
     // titles of the articles detected as duplicates
     def ignored = []
     // every article title seen so far
     def allTitles = new HashSet()
     // NOTE(review): 'dones' is read by the final report ("keeped=") but no visible line writes to it.
     def dones = [:]
     // file names of the duplicated articles, parallel to 'ignored'
     def ignoreds = []
     /**
      * Collects the text found under a parsed XML node.
      *
      * Strings are returned as is; for any other node the text of all its
      * children is gathered recursively, in document order. Line breaks are
      * replaced by spaces and the result is trimmed, then prefixed with a
      * single space.
      *
      * @param node a String, or an object exposing children() (e.g. groovy.util.Node); may be null
      * @return the flattened text preceded by one space, or null when node is null
      *         (callers test for null to detect a missing element)
      */
     def getText(def node) {
     	// a missing element must not crash the conversion
     	if (node == null) return null
     	if (node instanceof String) {
     		return " "+node.replace("\n", " ").trim()
     	}
     	StringBuilder collected = new StringBuilder()
     	for (def child : node.children()) {
     		collected.append(" ").append(getText(child))
     	}
     	return " "+collected.toString().replace("\n", " ").trim()
     }
     println files.size()+" files to process."
     println "Creating $metadataFile"
     // Wraps a value in the text separator, doubling any separator found inside it.
     // Literal replacement is used so that a separator that happens to be a regex
     // metacharacter is handled correctly.
     def csvCell = { cell ->
     	String quote = String.valueOf(txtSeparator)
     	return quote+String.valueOf(cell).replace(quote, quote+quote)+quote
     }
     // Converts a French long date ("lundi 3 janvier 2017") to yyyy-MM-dd, or "NA" when it cannot be parsed.
     def toIsoDate = { String frenchDate ->
     	try {
     		def longFormat = java.text.DateFormat.getDateInstance(java.text.DateFormat.FULL, java.util.Locale.FRANCE)
     		return new java.text.SimpleDateFormat("yyyy-MM-dd").format(longFormat.parse(frenchDate))
     	} catch (java.text.ParseException e) {
     		println "can't parse date: '$frenchDate'"
     		return "NA"
     	}
     }
     // Each HTML file is first normalised to XHTML with Jsoup, then every <article>
     // it contains yields one line of metadata.csv and one XML text file.
     for (File htmlFile : files) {
     	println "Processing $htmlFile"
     	String name = htmlFile.getName()
     	name = name.substring(0, name.lastIndexOf("."))
     	File xhtmlFile = new File(tmpXhtmlOutput, name+".xhtml")
     	def doc
     	if (inputEncoding) {
     		doc = Jsoup.parse(htmlFile, inputEncoding, "")
     	} else {
     		doc = Jsoup.parse(htmlFile, "UTF8")
     	}
     	doc.outputSettings().escapeMode(org.jsoup.nodes.Entities.EscapeMode.xhtml)
     	doc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml)
     	xhtmlFile.withWriter("UTF-8") { out ->
     		// drop the empty attributes named by a lone double quote
     		out.print doc.html().replace("\"=\"\"", "")
     	}
     	if (!ValidateXml.test(xhtmlFile)) {
     		println "Error: $xhtmlFile is malformed."
     		continue
     	}
     	def root = new XmlParser(false, true, true).parse(xhtmlFile)
     	// one <article> per text
     	for (def article : root.body.article) {
     		// first element carrying each metadata class (null when absent)
     		def textMetadata = [:]
     		for (def key : metadataKeys) {
     			textMetadata[key] = article.'**'.find { node -> node instanceof groovy.util.Node && node["@class"] == key }
     		}
     		def textContent = article.'**'.find { node -> node instanceof groovy.util.Node && node["@class"] == textClass }
     		if (textContent == null) {
     			// without a text container there is nothing to write
     			println "Error: no '$textClass' element found in an article of $htmlFile, article skipped."
     			continue
     		}
     		// the article title identifies a text; an article without title is never treated as a duplicate
     		def titleNode = textMetadata["titreArticle"]
     		String sign = (titleNode == null) ? null : getText(titleNode)
     		// build text id
     		itext++
     		String textId = name+"_"+formater.format(itext)
     		File xmlFile = new File(outputDir, textId+".xml")
     		if (sign != null && allTitles.contains(sign)) {
     			// same title already seen: the text goes to the duplicates folder
     			ignored << sign
     			xmlFile = new File(duplicates, textId+".xml")
     			ignoreds << xmlFile.getName()
     		} else if (sign != null) {
     			allTitles.add(sign)
     			// remember which file was kept, for the duplicates report
     			dones[sign] = xmlFile.getName()
     		}
     		// values derived from DocHeader; "NA" when the header cannot be decoded
     		String rubrique = "NA"
     		String date = "NA"
     		String words = "NA"
     		String pages = "NA"
     		def ref = null
     		// write metadata
     		metadataWriter.print "$textId"
     		for (def k : textMetadata.keySet()) {
     			def metadataNode = textMetadata[k]
     			String value = (metadataNode == null) ? "N/A" : getText(metadataNode)
     			if (k == "DocHeader") { // date, words, pages
     				def docHeaderParse = (value =~ /^(.*)((lundi|mardi|mercredi|jeudi|vendredi|samedi|dimanche) [0-9][0-9]? (janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre) [0-9]{4}) - ([0-9]+) mots, p\. (.*)$/)
     				if (docHeaderParse.size() == 1) {
     					rubrique = docHeaderParse[0][1].trim()
     					date = toIsoDate(docHeaderParse[0][2])
     					words = docHeaderParse[0][5]
     					pages = docHeaderParse[0][6]
     				} else { // date only
     					docHeaderParse = (value =~ /^(.*)((lundi|mardi|mercredi|jeudi|vendredi|samedi|dimanche) ([0-9][0-9]?) (janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre) ([0-9]{4}))(.*)$/)
     					if (docHeaderParse.size() == 1) {
     						date = toIsoDate(docHeaderParse[0][2])
     					}
     				}
     			} else if (k == "DocPublicationName") ref = value.trim()
     			metadataWriter.print columnSeparator+csvCell(value.replaceAll("\n", "").trim())
     		}
     		// texts are ordered by date
     		def textorder = date
     		ref = ref+", "+date
     		metadataWriter.print columnSeparator+csvCell(rubrique)
     		metadataWriter.print columnSeparator+csvCell(date)
     		metadataWriter.print columnSeparator+csvCell(words)
     		metadataWriter.print columnSeparator+csvCell(pages)
     		metadataWriter.print columnSeparator+csvCell(textorder)
     		metadataWriter.println ""
     		// write content
     		textContent.name = "text" // set root tag to "text"
     		textContent["@id"] = textId // set the text id
     		textContent["@ref"] = ref
     		def writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(xmlFile) , "UTF-8"))
     		try {
     			writer.println "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
     			// write XML file
     			new XmlNodePrinter(writer).print(textContent)
     		} finally {
     			// release the file even when printing fails
     			writer.close()
     		}
     	}
     }
     // All metadata lines are written: release metadata.csv.
     metadataWriter.close()
     // When duplicates were found, list them in duplicates/ignored.txt:
     // one entry per duplicate, with the kept file, the duplicate file and the title.
     if (!ignored.isEmpty()) {
     	File ignoredFile = new File (duplicates, "ignored.txt")
     	ignoredFile.withWriter("UTF-8") { report ->
     		report.println "TOTAL: "+ignored.size()
     		ignored.eachWithIndex { title, position ->
     			report.println "\n**DUPLICATE\n "
     			report.println "keeped="+dones[title]
     			report.println "duplicates="+ignoreds[position]
     			report.println "SIGN="+title
     			report.println "\n"
     		}
     	}
     	println "TOTAL IGNORED: "+ignored.size()
     	println "	see $ignoredFile for text IDs"
     }
     println "$itext articles found."
     // Temporary results are kept in debug mode (see the usage text of the
     // 'debug' option) and removed otherwise.
     if (!debug) {
     	tmpXhtmlOutput.deleteDir()
     	tmpDir.deleteDir()
     }

     // STANDARD DECLARATIONS
     package org.txm.macro.debug
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcp.swt.widget.parameters.*
     import org.txm.core.preferences.TXMPreferences
     import org.txm.searchengine.cqp.CQPPreferences
     // Print the value of the CQI_SERVER_PATH_TO_CQPLIB preference, read from the CQP preferences node.
     // (To print every preference instead: org.txm.core.preferences.TXMPreferences.dump())
     def cqpLibPreference = TXMPreferences.getString(CQPPreferences.CQI_SERVER_PATH_TO_CQPLIB, CQPPreferences.PREFERENCES_NODE)
     println cqpLibPreference

     // STANDARD DECLARATIONS
     package org.txm.macroproto
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.ca.core.functions.CA
     import org.txm.rcpapplication.swt.widget.parameters.*
     import org.txm.statsengine.r.core.RWorkspace
     // BEGINNING OF PARAMETERS
     // This macro only works on a CA result selected in the Corpus view.
     if (!(corpusViewSelection instanceof CA)) {
     	println "Selection is not a CA. Please select a CA result in the Corpus view"
     	return;
     }
     // File receiving the plot.
     @Field @Option(name="outputFile", usage="an example file", widget="FileSave", required=true, def="file.svg")
     def outputFile
     // Which ellipses to draw: 'row' or 'col'.
     @Field @Option(name="draw", usage="'row' or 'col'", widget="String", required=true, def="row")
     def draw
     // Open the parameters input dialog box
     if (!ParametersDialog.open(this)) return;
     // END OF PARAMETERS
     // Symbol of the selected CA, as returned by getSymbol(); it is interpolated into the R script below.
     def symbol = corpusViewSelection.getSymbol()
     // R script: plot the CA, then add the requested ellipses.
     def plotScript = """
     plot($symbol);
     ellipseCA($symbol, ellipse=c("$draw"));
     """
     RWorkspace.getRWorkspaceInstance().plot(outputFile, plotScript);
     println "Done: "+outputFile.getAbsolutePath()

     // STANDARD DECLARATIONS
     package org.txm.macro
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcpapplication.swt.widget.parameters.*
     import org.txm.functions.ca.CA
     import org.txm.stat.engine.r.RWorkspace
     // BEGINNING OF PARAMETERS
     // Guard: a CA result must be selected in the Corpus view.
     if (!(corpusViewSelection instanceof CA)) {
     	println "selection is not a CA. Please select a CA result in the Corpus view"
     	return;
     }
     // Destination file of the plot.
     @Field @Option(name="outputFile", usage="an example file", widget="FileSave", required=true, def="file.svg")
     def outputFile
     // Ellipses to draw, 'row' or 'col'.
     @Field @Option(name="draw", usage="'row' or 'col'", widget="String", required=true, def="row")
     def draw
     // Open the parameters input dialog box
     if (!ParametersDialog.open(this)) return;
     // END OF PARAMETERS
     def selectedCA = corpusViewSelection
     def caSymbol = selectedCA.getSymbol()
     def rWorkspace = RWorkspace.getRWorkspaceInstance()
     // The R code plots the CA and overlays the ellipses selected by 'draw'.
     def rCode = """
     plot($caSymbol);
     ellipseCA($caSymbol, ellipse=c("$draw"));
     """
     rWorkspace.plot(outputFile, rCode);
     println "Done: "+outputFile.getAbsolutePath()

     // STANDARD DECLARATIONS
     package org.txm.macro
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcpapplication.swt.widget.parameters.*
     import org.txm.searchengine.cqp.corpus.*
     import net.sf.saxon.*
     import net.sf.saxon.query.*
     import net.sf.saxon.om.*
     import javax.xml.transform.*
     import javax.xml.transform.sax.*
     import javax.xml.transform.stream.*
     import org.xml.sax.*
     import javax.xml.xpath.*
     import net.sf.saxon.event.*
     import org.w3c.dom.*
     import net.sf.saxon.s9api.*
     import javax.xml.parsers.*
     // BEGINNING OF PARAMETERS
     // The macro works on the corpus selected in the Corpus view.
     if (!(corpusViewSelection instanceof Corpus)) {
     	println "Error: Selection must be a corpus"
     	return false;
     }
     // XQuery file; it must carry the '.xq' extension.
     // NOTE(review): the visible script only checks this file's name, the query
     // that is executed is the one written inline further down - confirm.
     @Field @Option(name="xqFile", usage="a Xquery file", widget="FileOpen", required=true, def="C:/Temp/foo.xq")
     def xqFile
     // Optional file receiving the result; the result is printed when it is not set.
     @Field @Option(name="outFile", usage="optional output file", widget="File", required=false, def="")
     def outFile
     @Field @Option(name="debug", usage="an example file", widget="Boolean", required=true, def="false")
     def debug
     // Open the parameters input dialog box
     if (!ParametersDialog.open(this)) return;
     // END OF PARAMETERS
     println "corpora selection: "+corpusViewSelection
     if (!xqFile.getName().endsWith(".xq")) {
     	// the message names the extension that is tested and the parameter that was checked
     	println "Error: Xquery selected file is not a '.xq' file: $xqFile"
     	return false;
     }
     // The XML-TXM files of the corpus are read from <corpus base directory>/txm/<main corpus name>.
     MainCorpus mainCorpus = ((Corpus)corpusViewSelection).getMainCorpus();
     File binDir = mainCorpus.getBaseDirectory();
     File txmDir = new File(binDir, "txm/"+mainCorpus.getName());
     if (!txmDir.exists()) {
     	println "Error: the 'txm' directory does not exist: $txmDir"
     	return false;
     }
     def xmlFiles = txmDir.listFiles();
     if (xmlFiles == null || xmlFiles.size() == 0) {
     	println "Error: no file found in $txmDir"
     	return false;
     }
     // Inline XQuery: for every tei:w element of the documents of the txmDir
     // collection, emit a <match> element carrying the @id of the word.
     // $pos is bound to the text of the txm:ana[@type="#frpos"] child but is not used in the return clause.
     String query = """<matches>
+      {
         for \$t in fn:collection('$txmDir')
           for \$w in \$t//tei:w
             let \$pos := \$w/txm:ana[@type="#frpos"]/text()
             return <match>{\$w/@id}</match>
+      }
     </matches>
     """
     // Compile the query with the tei, txm and fn prefixes declared.
     Processor processor = new Processor(false)
     XQueryCompiler xqc = processor.newXQueryCompiler()
     xqc.declareNamespace("tei", "http://www.tei-c.org/ns/1.0")
     xqc.declareNamespace("txm", "http://textometrie.org/1.0")
     xqc.declareNamespace("fn", "http://www.w3.org/2005/xpath-functions")
     XQueryExecutable exp = xqc.compile(query)
     // Run the query; its result is collected in a new DOM document built by a namespace-aware factory.
     DocumentBuilderFactory dfactory = DocumentBuilderFactory.newInstance();
     dfactory.setNamespaceAware(true);
     Document dom = dfactory.newDocumentBuilder().newDocument();
     exp.load().run(new DOMDestination(dom));
     // Write the result to outFile when one was given, print it otherwise.
     if (outFile instanceof File && outFile.getName().length() > 0) {
     	// withWriter closes the file even when writing fails
     	outFile.withWriter("UTF-8") { out ->
     		out.println dom.getDocumentElement()
     	}
     	println "Result written in "+outFile.getAbsolutePath()
     } else {
     	println dom.getDocumentElement()
     }

     // STANDARD DECLARATIONS
     package org.txm.macro
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcpapplication.swt.widget.parameters.*
     import org.txm.searchengine.cqp.corpus.*
     import org.txm.importer.ApplyXsl2;
     import groovy.util.XmlParser
     // BEGINNING OF PARAMETERS
     // A corpus must be selected in the Corpus view.
     boolean corpusSelected = corpusViewSelection instanceof Corpus
     if (!corpusSelected) {
     	println "Error: Selection must be a corpus"
     	return false;
     }
     // Stylesheet applied to every XML file of the corpus; must end with '.xsl'.
     @Field @Option(name="xslFile", usage="an example file", widget="FileOpen", required=true, def="C:/Temp/foo.txt")
     def xslFile
     // When set, the intermediate result files are kept.
     @Field @Option(name="debug", usage="an example file", widget="Boolean", required=true, def="false")
     def debug
     // Open the parameters input dialog box
     if (!ParametersDialog.open(this)) return;
     // END OF PARAMETERS
     println "corpora selection: "+corpusViewSelection
     boolean isStylesheet = xslFile.getName().endsWith(".xsl")
     if (!isStylesheet) {
     	println "Error: XSL selected file is not a '.xsl' file: $xslFile"
     	return false;
     }
     // Sources are read from <base directory>/txm/<main corpus name>,
     // results are written to <base directory>/results.
     MainCorpus mainCorpus = ((Corpus)corpusViewSelection).getMainCorpus();
     File binDir = mainCorpus.getBaseDirectory();
     File txmDir = new File(binDir, "txm/"+mainCorpus.getName());
     File resultsDir = new File(binDir, "results");
     resultsDir.mkdirs()
     if (!txmDir.exists()) {
     	println "Error: the 'txm' directory does not exist: $txmDir"
     	return false;
     }
     def xmlFiles = txmDir.listFiles();
     boolean nothingToProcess = (xmlFiles == null) || (xmlFiles.size() == 0)
     if (nothingToProcess) {
     	println "Error: no file found in $txmDir"
     	return false;
     }
     // Each match is the list of the ids of its <wRef> children; identical matches are stored once.
     HashSet<List<String>> allmatches = new HashSet<List<String>>();
     ApplyXsl2 a = new ApplyXsl2(xslFile.getAbsolutePath());
     println "Querying..."
     // Apply the stylesheet to every XML file, then read the <match> elements of the result.
     for (File xmlFile : xmlFiles) {
     	println "	"+xmlFile.getName()
     	File resultFile = new File(resultsDir, "xslqueryresult_"+xmlFile.getName());
     	a.process(xmlFile, resultFile);
     	def matches = new XmlParser().parse(resultFile)
     	matches.match.each() { match ->
     		def l = match.wRef.collect { it.attribute("id") }
     		// a match without any word reference cannot be turned into a query
     		if (!l.isEmpty()) allmatches << l
     	}
     	// result files are only kept in debug mode
     	if (!debug) resultFile.delete()
     }
     // Build one CQL alternative per match: the first word is selected by its id,
     // the following words of the match by as many unconstrained positions.
     def subqueries = []
     for (def m : allmatches) {
     	// nothing to anchor the query on
     	if (m.isEmpty()) continue
     	String firstWord = "[id=\""+m[0]+"\"]"
     	if (m.size() == 1) subqueries << firstWord
     	else if (m.size() == 2)
     		subqueries << firstWord+"[]"
     	else
     		subqueries << firstWord+"[]{"+(m.size()-1)+"}"
     }
     def query = subqueries.join("|")
     def initialquery = query
     // Keep the query under 1500 characters by dropping alternatives from the end.
     while (query.length() > 1500) {
     	int lastSeparator = query.lastIndexOf("|")
     	// a single alternative is left: it cannot be shortened any further
     	if (lastSeparator < 0) break
     	query = query.substring(0, lastSeparator)
     }
     if (initialquery != query) println "Warning : query has been truncated: "+initialquery
     println "CQL: $query"
     if (debug) println "See debug files in: "+resultsDir
     if (initialquery != query) println "Initial CQL: $initialquery"

     // STANDARD DECLARATIONS
     package org.txm.macro
     import org.txm.searchengine.cqp.corpus.Partition;
     // The macro needs a partition selected in the Corpus view.
     boolean partitionSelected = corpusViewSelection instanceof Partition
     if (!partitionSelected) {
     	println "Select a partition before calling this macro."
     	return;
     }
     // Print one line per part: name, tabulation, size.
     Partition partition = corpusViewSelection
     partition.getParts().each { part ->
     	println part.getName()+"\t"+part.getSize()
     }

     // Copyright © 2016 ENS de Lyon, CNRS, University of Franche-Comté
     // Licensed under the terms of the GNU General Public License (http://www.gnu.org/licenses)
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import java.nio.file.Path
     import java.nio.file.attribute.FileOwnerAttributeView
     import java.nio.file.attribute.UserPrincipal
     import org.txm.rcpapplication.swt.widget.parameters.*
     import org.txm.Toolbox
     import org.txm.searchengine.cqp.*
     import java.io.IOException;
     import java.nio.file.*
     import java.nio.file.attribute.*;
     // Parameter declaration - Déclaration du paramètre
     // Directory whose access rights and file attributes are printed by this macro.
     // NOTE(review): the default value "set da enpos;" does not look like a folder path,
     // and the error message below is labelled "ExecCQLMacro" - both presumably copied
     // from another macro; confirm.
     @Field @Option(name="directory", usage="the directory to diagnose", widget="Folder", required=true, def="set da enpos;")
     		File directory;
     // Parameters settings UI: stop when the dialog does not return true
     if (!ParametersDialog.open(this)) {
     	println("** ExecCQLMacro error: Impossible to open Parameters settings UI dialog box.")
     	return
+    }
     // Access rights as seen through java.io.File.
     println "full path="+directory.getAbsolutePath()
     println " exists? "+directory.exists()
     println " read? "+directory.canRead()
     println " write? "+directory.canWrite()
     println " executable? "+directory.canExecute()
     println " hidden? "+directory.isHidden()
     def files = directory.listFiles()
     // listFiles() returns null when the path is not a directory or cannot be listed
     println " number of files? "+(files == null ? "not available (not a directory or not listable)" : files.size())
     Path path = Paths.get(directory.getAbsolutePath());
     // Owner and ACL, when the file system provides these attribute views.
     // Reading them can fail (e.g. missing file): report the failure and carry on with the diagnostic.
     FileOwnerAttributeView ownerAttributeView = Files.getFileAttributeView(path, FileOwnerAttributeView.class);
     if (ownerAttributeView != null) {
     	try {
     		UserPrincipal owner = ownerAttributeView.getOwner();
     		if (owner != null) println " file owner attribute: "+owner.getName()
     	} catch (IOException e) {
     		println " file owner attribute: not readable ("+e+")"
     	}
     }
     AclFileAttributeView aclAttributeView = Files.getFileAttributeView(path, AclFileAttributeView.class);
     if (aclAttributeView != null) {
     	try {
     		List<AclEntry> acl = aclAttributeView.getAcl();
     		if (acl != null) {
     			for (AclEntry entry : acl)
     				if (entry != null) println " acl entry: "+entry
     		}
     	} catch (IOException e) {
     		println " acl entry: not readable ("+e+")"
     	}
     }
     // For each attribute view below: nothing is printed when the file system does
     // not provide the view (null view) or when no attributes are returned.

     // Basic attributes.
     BasicFileAttributes basic = Files.getFileAttributeView(path, BasicFileAttributeView)?.readAttributes()
     if (basic != null) {
     	println " basic attributes: creation time: "+basic.creationTime()
     	println " basic attributes: last access time: "+basic.lastAccessTime()
     	println " basic attributes: last modification time: "+basic.lastModifiedTime()
     	println " basic attributes: file key: "+basic.fileKey()
     	println " basic attributes: directory file?: "+basic.isDirectory()
     	println " basic attributes: symbolic link?: "+basic.isSymbolicLink()
     	println " basic attributes: regular file?: "+basic.isRegularFile()
     }
     // DOS attributes.
     DosFileAttributes dos = Files.getFileAttributeView(path, DosFileAttributeView)?.readAttributes()
     if (dos != null) {
     	println " dos attributes: creation time: "+dos.creationTime()
     	println " dos attributes: last access time: "+dos.lastAccessTime()
     	println " dos attributes: last modification time: "+dos.lastModifiedTime()
     	println " dos attributes: file key: "+dos.fileKey()
     	println " dos attributes: directory file?: "+dos.isDirectory()
     	println " dos attributes: symbolic link?: "+dos.isSymbolicLink()
     	println " dos attributes: regular file?: "+dos.isRegularFile()
     	println " dos attributes: archive file?: "+dos.isArchive()
     	println " dos attributes: system file?: "+dos.isSystem()
     }
     // POSIX attributes.
     PosixFileAttributes posix = Files.getFileAttributeView(path, PosixFileAttributeView)?.readAttributes()
     if (posix != null) {
     	println " posix attributes: group: "+posix.group()
     	println " posix attributes: creation time: "+posix.creationTime()
     	println " posix attributes: last access time: "+posix.lastAccessTime()
     	println " posix attributes: last modification time: "+posix.lastModifiedTime()
     	println " posix attributes: file key: "+posix.fileKey()
     	println " posix attributes: directory file?: "+posix.isDirectory()
     	println " posix attributes: symbolic link?: "+posix.isSymbolicLink()
     	println " posix attributes: regular file?: "+posix.isRegularFile()
     	println " posix attributes: permissions: "+posix.permissions().sort()
     }
     // Names of the user defined attributes.
     def userDefinedNames = Files.getFileAttributeView(path, UserDefinedFileAttributeView)?.list()
     if (userDefinedNames != null) {
     	userDefinedNames.each { attributeName ->
     		println " user defined attributes: "+attributeName
     	}
     }

     // STANDARD DECLARATIONS
     package org.txm.macro
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcpapplication.swt.widget.parameters.*
     // BEGINNING OF PARAMETERS
     // File whose access rights are changed
     @Field @Option(name="file", usage="an example file", widget="FileOpen", required=true, def="C:/Temp/foo.txt")
     def file
     // Value passed to File.setReadable
     @Field @Option(name="read_right", usage="read", widget="Boolean", required=true, def="true")
     def read_right
     // Value passed to File.setWritable
     @Field @Option(name="write_right", usage="write", widget="Boolean", required=true, def="true")
     def write_right
     // Value passed to File.setExecutable
     @Field @Option(name="execute_right", usage="execute", widget="Boolean", required=true, def="true")
     def execute_right
     // Passed as the "ownerOnly" argument of the three java.io.File setters
     // (the usage text used to be a copy of the read_right one)
     @Field @Option(name="current_user_only", usage="apply the rights to the file owner only (otherwise to everybody)", widget="Boolean", required=true, def="false")
     def current_user_only
     // Open the parameters input dialog box; stop the macro when it returns false
     if (!ParametersDialog.open(this)) return;
     // END OF PARAMETERS
     // Apply the three requested rights to the selected file.
     // The java.io.File setters do not throw when the change is refused: they
     // return false, so each result is checked and reported instead of being dropped.
     if (file.exists()) {
     	if (!file.setReadable(read_right, current_user_only)) {
     		println "Warning: could not set the read right of $file"
     	}
     	if (!file.setWritable(write_right, current_user_only)) {
     		println "Warning: could not set the write right of $file"
     	}
     	if (!file.setExecutable(execute_right, current_user_only)) {
     		println "Warning: could not set the execute right of $file"
     	}
     } else {
     	println "Error: file not found $file"
     }

     // STANDARD DECLARATIONS
     package org.txm.macro
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcpapplication.swt.widget.parameters.*
     // BEGINNING OF PARAMETERS
     // Declare each parameter here
     // (available widget types: Query, File, Folder, String, Text, Boolean, Integer, Float and Date)
     // Python script handed to the "python" command
     @Field @Option(name="pythonFile", usage="an example file", widget="FileOpen", required=true, def="script.py")
     def pythonFile
     // Parameters settings UI
     if (!ParametersDialog.open(this)) {
     	println("** ExecCQLMacro error: Impossible to open Parameters settings UI dialog box.")
     	return
     }
     // Use the list form of execute(): the script path is passed as one argument,
     // so a path containing spaces is no longer split into several arguments.
     def process = ["python", pythonFile.toString()].execute()
     // Drain stdout AND stderr while waiting for the process to end: an unread
     // stderr can block the child process, and exitValue() requires a finished process.
     def standardOutput = new StringBuilder()
     def errorOutput = new StringBuilder()
     process.waitForProcessOutput(standardOutput, errorOutput)
     standardOutput.toString().eachLine { println it }
     // Show what the interpreter wrote on stderr (tracebacks, "file not found", ...)
     errorOutput.toString().eachLine { println it }
     def exitValue = process.exitValue()
     if (exitValue != 0) println "Error during execution: $exitValue"

     package org.txm.macro.xml;
     // STANDARD DECLARATIONS
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcpapplication.swt.widget.parameters.*
     import org.txm.importer.ApplyXsl2;
     import javax.xml.transform.stream.*
     import javax.xml.transform.dom.DOMResult
     import org.w3c.dom.*
     // BEGINNING OF PARAMETERS
     // XSL stylesheet to apply; initialised to TXM/xsl/identity.xsl under the user's home directory
     @Field @Option(name="XSLFile", usage="an example file", widget="File", required=true, def="file.xsl")
     def XSLFile = new File(System.getProperty("user.home"),"TXM/xsl/identity.xsl")
     // Directory whose *.xml / *.XML files are transformed; initialised to xml/TESTS2/xml under the user's home directory
     // (the "intput" spelling is the option name, keep it as is)
     @Field @Option(name="intputDirectory", usage="an example folder", widget="Folder", required=true, def="in")
     def intputDirectory = new File(System.getProperty("user.home"),"xml/TESTS2/xml")
     // The option below is disabled: "parameters" is therefore not a dialog field and always starts as an empty map
     //@Field @Option(name="parameters", usage="an example folder", widget="Text", required=false, def="")
     def parameters = [:]
     // Selects the kind of result built by process(): a DOMResult when true
     @Field @Option(name="dom", usage="XSLT Result is - true:  a DOM Element. false - a XSLT Result is XMLStreamReader", widget="Boolean", required=true, def="true")
     def dom
     // When true, the stack trace of a failed transformation is printed
     @Field @Option(name="debug", usage="Show debug messages, value = true|false", widget="Boolean", required=true, def="false")
     def debug
     // Open the parameters dialog; stop the macro when it returns false
     if (!ParametersDialog.open(this)) return;
     // END OF PARAMETERS
     // USER MANIPULATIONS
     /**
      * User hook called for each transformed file when the "dom" parameter is true.
      * Prints the input file name and the tag name of the result root element.
      *
      * @param inputXMLFile the XML file that was transformed
      * @param resultnode the document element of the DOM result
      */
     def processDOMResult(File inputXMLFile, def resultnode) {
     	// with resultnode a Element : https://docs.oracle.com/javase/8/docs/api/org/w3c/dom/Element.html
     	println inputXMLFile.getName()+" -> "+ resultnode.getTagName()
     }
     /**
      * User hook called for each transformed file when the "dom" parameter is false.
      * The main loop calls it but the macro never defined it; this default only
      * prints the input file name and the result object, whatever its type.
      *
      * @param inputXMLFile the XML file that was transformed
      * @param result the non-DOM result returned by process()
      */
     def processSaxResult(File inputXMLFile, def result) {
     	println inputXMLFile.getName()+" -> "+ result
     }
     // END USER MANIPULATIONS
     // Apply the stylesheet to every *.xml / *.XML entry of the input directory and
     // hand each result to the user hook matching the "dom" parameter.
     println "Use XSL $XSLFile with parameters $parameters"
     println "Processed directory: $intputDirectory"
     def files = [] // inputs that were transformed and post-processed without error
     ApplyXsl2 a = new ApplyXsl2(XSLFile.getAbsolutePath());
     intputDirectory.eachFileMatch(~/.+\.(xml|XML)/) { XMLFile ->
     	String name = XMLFile.getName()
     	try {
     		// pass the announced parameters (an empty map was hard-coded here)
     		def result = process(a, XMLFile, parameters);
     		if (result instanceof Boolean) {
     			// process() returns false when a stylesheet parameter is refused
     			println "Warning: XSL transformation of '$name' skipped: a stylesheet parameter could not be set"
     			return // continue with the next file
     		}
     		if (dom) processDOMResult(XMLFile, result.getNode().getDocumentElement());
     		else processSaxResult(XMLFile, result);
     		files << XMLFile
     	} catch (Exception e) {
     		println "Warning: XSL transformation of '$name' failed with error=$e"
     		if (debug) e.printStackTrace();
     	}
     }
     /**
      * Runs the stylesheet wrapped by "a" on one XML file.
      *
      * @param a the stylesheet wrapper; its transformer performs the transformation
      * @param inputXMLFile the XML file to transform
      * @param args stylesheet parameters (name -> value) set before the transformation
      * @return false when a parameter could not be set; otherwise a DOMResult when the
      *         "dom" parameter is true, or a javax.xml.stream.XMLStreamReader positioned
      *         at the start of the transformed document when it is false
      * @throws Exception when the transformation or the re-reading of its output fails
      */
     def process(ApplyXsl2 a, File inputXMLFile, def args) throws Exception {
     	for (String k : args.keySet()) {
     		// The script has no setParam method, the call was meant for the wrapper.
     		// NOTE(review): assumes ApplyXsl2 exposes setParam(name, value) next to resetParams() - confirm
     		if (!a.setParam(k, args[k]))
     			return false;
     	}
     	def serialized = null // only used by the non-DOM branch
     	def result = null;
     	if (dom) result = new DOMResult();
     	else {
     		// The previous draft of this branch did not compile (unbalanced parenthesis,
     		// instantiation of the XMLStreamReader interface, undefined output file).
     		// The output is serialized in memory and re-read below with a StAX reader.
     		serialized = new ByteArrayOutputStream();
     		result = new StreamResult(serialized);
     	}
     	a.transformer.transform(new StreamSource(inputXMLFile), result);
     	a.cleanMemory(); // save memory
     	a.resetParams()
     	if (dom) return result;
     	// Fully qualified name: javax.xml.stream is not imported by this macro
     	return javax.xml.stream.XMLInputFactory.newInstance().createXMLStreamReader(new ByteArrayInputStream(serialized.toByteArray()));
     }

Formats disponibles : Unified diff

Laboratoire ICAR » Plateforme TXM

Révision 1543