/ - Diff - Plateforme TXM - Forge du Centre Blaise Pascal

     // Copyright © 2015 - ENS de Lyon - http://textometrie.ens-lyon.fr
     // @author sheiden
     //
     // This file is part of the TXM platform.
     //
     // The TXM platform is free software: you can redistribute it and/or modify
     // it under the terms of the GNU General Public License as published by
     // the Free Software Foundation, either version 3 of the License, or
     // (at your option) any later version.
     //
     // The TXM platform is distributed in the hope that it will be useful,
     // but WITHOUT ANY WARRANTY; without even the implied warranty of
     // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     // GNU General Public License for more details.
     //
     // You should have received a copy of the GNU General Public License
     // along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
     //
     // $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $
     // $LastChangedRevision: 2378 $
     // $LastChangedBy: sheiden $
     //
     package org.txm.macro.csv
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import java.nio.charset.Charset
     import org.txm.rcpapplication.swt.widget.parameters.*
     import org.txm.utils.*
     import org.txm.importer.*
     import javax.xml.stream.*
     import java.net.URL
     //import org.apache.xerces.util
     // CSV file to convert; it is opened with CsvReader further down.
     @Field @Option(name="inputFile",usage="CSV File", widget="File", required=false, def="/home/sheiden/Corpus/src/charlie/csv/fr_07jan.csv")
     File inputFile;
     // Column delimiter; only its first character is passed to CsvReader.
     @Field @Option(name="columnSeparator",usage="column columnSeparator", widget="String", required=false, def=",")
     def columnSeparator;
     // Charset name handed to Charset.forName() to decode the CSV file.
     @Field @Option(name="characterEncoding",usage="File characterEncoding", widget="String", required=false, def="UTF-8")
     def characterEncoding;
     // Name of the root element of the generated XML document.
     @Field @Option(name="rootTag",usage="Root tag name", widget="String", required=false, def="tweets")
     def rootTag;
     // Name of the element written once per CSV record.
     @Field @Option(name="textTag",usage="Text tag name", widget="String", required=false, def="tweet")
     def textTag;
     // Comma-separated column names; each one becomes an attribute of the textTag element.
     @Field @Option(name="metadataColumnList",usage="metadataColumnList column list separated with comma", widget="String", required=false, def="tweet-id-str,job-id,created-at,from-user,from-user-id-str,from-user-name,from-user-fullname,from-user-followers,from-user-following,from-user-favorites,from-user-tweets,from-user-timezone,to-user,to-user-id-str,to-user-name,source,location-geo,location-geo-0,location-geo-1,iso-language,analysis-state")
     def metadataColumnList;
     // Comma-separated column names; each one becomes a child element of the textTag element.
     @Field @Option(name="textColumnList",usage="textColumnList column list separated with comma", widget="String", required=false, def="text")
     def textColumnList;
     // Open the parameters dialog; stop when the user cancels it.
     if (!ParametersDialog.open(this)) return;
     // Both column lists are typed as comma-separated strings; names are trimmed where used.
     metadataColumnList = metadataColumnList.split(",")
     textColumnList = textColumnList.split(",")
     textTag = textTag.trim()
     rootTag = rootTag.trim()
     // Only the first character of columnSeparator is used as the delimiter.
     CsvReader reader = new CsvReader(inputFile.getAbsolutePath(), columnSeparator.charAt(0), Charset.forName(characterEncoding));
     if (!reader.readHeaders()) {
     	println "Error: no header"
     	reader.close() // do not leave the CSV file open when aborting
     	return
     }
     // Header names are normalised ('_' replaced by '-') before columns are looked up.
     def headers = Arrays.asList(reader.getHeaders())
     headers.eachWithIndex {str, index -> headers[index] = str.replaceAll(/_/, "-") }
     reader.setHeaders(headers as String[])
     println "Root tag: $rootTag"
     println "Text tag: $textTag"
     println "Metadata column(s): $metadataColumnList"
     println "Text column(s): $textColumnList"
     println "Processing..."
     // Reports every requested column that is absent from the header line.
     // Returns true only when all the columns of the list are present.
     def allColumnsPresent = { String kind, columns ->
     	def complete = true
     	for (String column : columns) {
     		column = column.trim()
     		if (!headers.contains(column)) {
     			println "Error: missing $kind column named '$column'"
     			complete = false
     		}
     	}
     	return complete
     }
     // Both lists are always checked so that all the missing columns are listed at once.
     def metadataOk = allColumnsPresent("metadata", metadataColumnList)
     def textOk = allColumnsPresent("text", textColumnList)
     def ok = metadataOk && textOk
     if (!ok) {
     	println "** CSV2XML: Column(s) missing, aborting..."
     	reader.close() // do not leave the CSV file open when aborting
     	return
     }
     // The XML file is created next to the CSV file, with the same base name.
     String name = inputFile.getName()
     // Strip only the last extension: "a.b.csv" gives "a.b.xml", not "a.xml".
     int idx = name.lastIndexOf(".")
     if (idx > 0) name = name.substring(0, idx)
     File outputfile = new File(inputFile.getParentFile(), name+".xml")
     // The output is always written in UTF-8, whatever the encoding of the input.
     XMLOutputFactory factory = XMLOutputFactory.newInstance()
     Writer output = new OutputStreamWriter(new FileOutputStream(outputfile) , "UTF-8")
     XMLStreamWriter writer = factory.createXMLStreamWriter(output)
     writer.writeStartDocument("UTF-8", "1.0")
     writer.writeCharacters("\n")
     writer.writeStartElement(rootTag)
     writer.writeCharacters("\n")
     // pb_n numbers the <pb/> elements (one per record); tweet maps a text to its count.
     int pb_n = 1
     def tweet = [:]
     // True when the code point cp may be written to the XML output: it lies in the
     // character ranges of XML 1.1, is not a restricted control character and is not
     // one of the "allowed but discouraged" noncharacters.
     // See http://en.wikipedia.org/wiki/Valid_characters_in_XML
     def isValidXmlCodePoint = { int cp ->
     	// valid ranges; unpaired surrogates (U+D800-U+DFFF) fall outside of them
     	boolean inRange = (cp >= 0x1 && cp <= 0xD7FF) || (cp >= 0xE000 && cp <= 0xFFFD) || (cp >= 0x10000 && cp <= 0x10FFFF)
     	if (!inRange) return false
     	// restricted control characters (tab, LF, CR and NEL stay allowed)
     	boolean control = (cp >= 0x1 && cp <= 0x8) || cp == 0xB || cp == 0xC || (cp >= 0xE && cp <= 0x1F) || (cp >= 0x7F && cp <= 0x84) || (cp >= 0x86 && cp <= 0x9F)
     	if (control) return false
     	// discouraged: U+FDD0-U+FDEF and the last two code points of planes 1 to 16
     	if (cp >= 0xFDD0 && cp <= 0xFDEF) return false
     	if (cp > 0xFFFF && (cp & 0xFFFE) == 0xFFFE) return false
     	return true
     }
     // True when every character of s (a String or a single char) passes
     // isValidXmlCodePoint; an empty or null value has nothing invalid and yields true.
     def isAllValidXmlChars = { s ->
     	String text = (s == null) ? "" : s.toString()
     	int i = 0
     	while (i < text.length()) {
     		// walk by code point so that surrogate pairs are tested as one character
     		int cp = text.codePointAt(i)
     		if (!isValidXmlCodePoint(cp)) return false
     		i += Character.charCount(cp)
     	}
     	return true
     }
     // Returns a copy of input without the characters rejected by isValidXmlCodePoint.
     // A null input yields an empty string.
     def stripInvalidXmlCharacters = { input ->
     	String text = (input == null) ? "" : input.toString()
     	StringBuilder sb = new StringBuilder()
     	int i = 0
     	while (i < text.length()) {
     		int cp = text.codePointAt(i)
     		if (isValidXmlCodePoint(cp)) {
     			sb.appendCodePoint(cp)
     		}
     		i += Character.charCount(cp)
     	}
     	return sb.toString()
     }
     // Writes 'source' to the XML writer, wrapping every match of the given patterns
     // in an element. 'matchers' is a flat list of quadruples:
     //   pattern, element name, attribute name, attribute value.
     // The first quadruple is applied here; the text between its matches is handed
     // recursively to the remaining quadruples, and plain text is written when the
     // list is exhausted. Matches of a type="url" quadruple are also wrapped in
     // <a href="...">.
     def tokenizeAs
     tokenizeAs = { source, matchers ->
     	// no pattern left: the segment is plain character data
     	if (matchers.size() == 0) {
     		writer.writeCharacters(source)
     		return
     	}
     	def (pattern, element, attribute, attValue) = matchers
     	def remaining = matchers.drop(4)
     	boolean asLink = (attribute == "type" && attValue == "url")
     	def found = (source =~ pattern)
     	int cursor = 0
     	while (found.find()) {
     		String hit = found.group()
     		// text located before this match goes through the other patterns
     		tokenizeAs(source.substring(cursor, found.start()), remaining)
     		if (asLink) {
     			writer.writeStartElement("a")
     			writer.writeAttribute("href", hit)
     		}
     		writer.writeStartElement(element)
     		writer.writeAttribute(attribute, attValue)
     		writer.writeAttribute("frpos", "NAM")
     		writer.writeAttribute("frlemma", hit)
     		writer.writeCharacters(hit)
     		writer.writeEndElement()
     		if (asLink) {
     			writer.writeEndElement()
     		}
     		cursor = found.end()
     	}
     	// text located after the last match (or the whole text when nothing matched)
     	tokenizeAs(source.substring(cursor), remaining)
     }
     // Re-tweet prefix "RT @user: "; group 1 is the text after the prefix
     // ((?s) lets '.' span line breaks). Compiled once for all the records.
     def rt_pattern = ~/(?s:^RT @[^:]+: (.*)$)/
     // Quadruples (pattern, element, attribute, value) handed to tokenizeAs.
     // Digits and '_' are accepted so that names such as @user_42 and short links
     // such as http://t.co/aB3x are not cut at the first digit.
     def tweetMatchers = [/@[\p{L}\p{N}_]+/, "w", "type", "attag",
     	/#[\p{L}\p{N}_]+/, "w", "type", "hashtag",
     	/((http|ftp|https):\/\/[\p{L}\p{N}\-_]+(\.[\p{L}\p{N}\-_]+)+([\p{L}\p{N}_\-\.,@?^=%&amp;:\/~\+#]*[\p{L}\p{N}_\-\@?^=%&amp;\/~\+#])?)/, "w", "type", "url"]
     // One <pb/> and one textTag element are written for each CSV record.
     while (reader.readRecord()) {
     	writer.writeCharacters("  ")
     	writer.writeEmptyElement("pb") // <pb/>
     	writer.writeAttribute("n", ""+pb_n++)
     	writer.writeCharacters("\n")
     	writer.writeCharacters("  ")
     	writer.writeStartElement(textTag)
     	// metadata columns become attributes of the textTag element
     	for (String m : metadataColumnList) {
     		m = m.trim()
     		writer.writeAttribute(m, CleanFile.clean(reader.get(m)))
     	}
     	writer.writeCharacters("\n")
     	// text columns become child elements
     	for (String t : textColumnList) {
     		t = t.trim()
     		writer.writeCharacters("    ")
     		writer.writeStartElement(t)
     		def c = reader.get(t)
     		def rt_m = (c =~ rt_pattern)
     		// find() is called exactly once per matcher: group(1) needs the match state
     		boolean retweet = rt_m.find()
     		if (retweet) {
     			// take off and count all the nested re-tweet prefixes
     			def nPrefix = 0
     			while (retweet) {
     				c = rt_m.group(1)
     				rt_m = (c =~ rt_pattern)
     				retweet = rt_m.find()
     				nPrefix++
     			}
     			// NOTE(review): a re-tweet is only counted; its text is not written to
     			// the XML output, the element stays empty - confirm this is intended
     			def crt = sprintf("RT-%d ", nPrefix)+c
     			tweet.put(crt, (tweet.get(crt) ?: 0) + 1)
     		} else {
     			// NOTE(review): an original tweet always counts for 1, even when the
     			// same text occurs several times - confirm this is intended
     			tweet.put(c, 1)
     			tokenizeAs(CleanFile.clean(c), tweetMatchers)
     		}
     		writer.writeEndElement() // t
     		writer.writeCharacters("\n")
     	}
     	writer.writeCharacters("  ")
     	writer.writeEndElement() // textTag
     	writer.writeCharacters("\n")
     }
     // Close the root element, then release the output and input resources.
     writer.writeEndElement() // rootTag
     writer.writeCharacters("\n")
     writer.close()
     output.close()
     reader.close()
     // Summary: number of distinct texts against the number of records read.
     def nt = tweet.size()
     println sprintf("\n%d tweets uniques sur %d lus", nt, pb_n-1)
     if (nt > 50) {
     	nt = 50
     	println "Printing first 50 most frequent [re-]tweets:"
     }
     // Most frequent first; the listing is limited to the nt entries announced above.
     tweet.sort { -it.value }.take(nt).each { entry ->
     	if (entry.key.length() > 0) {
     		// show at most the first 100 characters of the text
     		println sprintf("[%d] %s...", entry.value, entry.key.take(100))
     	} else {
     		println sprintf("Empty tweet content occurring %d times.", entry.value)
     	}
     }

     package org.txm.macro.txt
     // Copyright © - ENS de Lyon - http://textometrie.ens-lyon.fr
     // @author sheiden
     //
     // This file is part of the TXM platform.
     //
     // The TXM platform is free software: you can redistribute it and/or modify
     // it under the terms of the GNU General Public License as published by
     // the Free Software Foundation, either version 3 of the License, or
     // (at your option) any later version.
     //
     // The TXM platform is distributed in the hope that it will be useful,
     // but WITHOUT ANY WARRANTY; without even the implied warranty of
     // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     // GNU General Public License for more details.
     //
     // You should have received a copy of the GNU General Public License
     // along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
     //
     // $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $
     // $LastChangedRevision: 2378 $
     // $LastChangedBy: sheiden $
     //
     // STANDARD DECLARATIONS
     // imports
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcpapplication.swt.widget.parameters.*
     // PARAMETERS
     // Directory whose files are rewritten; initial value: <user home>/Bureau/voeux.
     @Field @Option(name="inputDirectory", usage="Dossier contenant les fichiers à modifier", widget="Folder", required=true, def='/home')
     		inputDirectory = new File(System.getProperty("user.home"), "Bureau/voeux")
     // Regular expression appended to '.*' to select the file names to process.
     @Field @Option(name="extension", usage="extension des fichiers à modifier (expression régulière)", widget="String", required=true, def='\\.txt')
     		extension = "\\.txt"
     // Regular expression searched in the whole content of each file.
     @Field @Option(name="find", usage="Expression régulière à chercher", widget="String", required=true, def='’')
     		find = "’"
     // Replacement string passed to replaceAll() for every match of 'find'.
     @Field @Option(name="replaceWith", usage="Chaîne de remplacement", widget="String", required=false, def='\'')
     		replaceWith = "'"
     // Character encoding used both to read and to rewrite the files.
     @Field @Option(name="encoding", usage="Encodage des caractères des fichiers", widget="String", required=true, def='UTF-8')
     		encoding = "utf-8"
     // PARAMETERS DIALOG
     if (!ParametersDialog.open(this)) return;
     // SANITY CHECK
     if (inputDirectory==null || find==null || replaceWith==null) { println "** MultiLineSearchReplaceInDirectoryMacro: the input directory, the regular expression to search and replacement must be specified."; return}
     // eachFileMatch() needs a directory: a plain file is refused like a missing path
     if (!inputDirectory.isDirectory()) { println "** MultiLineSearchReplaceInDirectoryMacro: impossible to access the '$inputDirectory' input directory."; return}
     // MAIN BODY
     println "-- working in $inputDirectory directory with files of '$extension' extension"
     println "-- replacing '$find' with '$replaceWith'"
     // Example: copy the year of a date attribute into a new 'year' attribute
     //   find = /date="([0-9]+)-([0-9]+-[0-9]+)"/
     //   replaceWith = 'date="$1-$2" year="$1"'
     //   (warning: '$1', '$2'... can be interpreted by Groovy in "..." strings)
     // Example of a multi-line pattern:
     //   <tei:pb [^>]+>\n   <tei:p>[0-9]+</tei:p>\n   <tei:p>[^ ]+ 18[5-6][0-9]\.</tei:p>\n
     def p = /$find/
     inputDirectory.eachFileMatch(~/.*$extension/) { file ->               // for each file matching extension
     	// eachFileMatch also returns matching sub-directories: only files are rewritten
     	if (!file.isFile()) return
     	println "\n-- processing: "+file.getName()
     	// the new content is first written to a temporary file of the same directory
     	def tmp = File.createTempFile("SearchReplaceInDirectoryTemp", ".tmp", file.getParentFile())
     	try {
     		tmp.withWriter(encoding) { writer ->
     			// the whole file is read at once so that the expression can span several lines
     			writer.print(file.getText(encoding).replaceAll(p, replaceWith))
     		}
     		// replace the original in one step: it is never deleted before the new
     		// content is in place, and a failure is reported instead of being ignored
     		java.nio.file.Files.move(tmp.toPath(), file.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING)
     	} catch (IOException e) {
     		println "** MultiLineSearchReplaceInDirectoryMacro: impossible to rewrite '$file': "+e.getMessage()
     		tmp.delete() // the original file is left untouched
     	}
     }

     package org.txm.macro.line
     // Copyright © 2018 - ENS de Lyon - http://textometrie.ens-lyon.fr
     // @author sheiden
     //
     // This file is part of the TXM platform.
     //
     // The TXM platform is free software: you can redistribute it and/or modify
     // it under the terms of the GNU General Public License as published by
     // the Free Software Foundation, either version 3 of the License, or
     // (at your option) any later version.
     //
     // The TXM platform is distributed in the hope that it will be useful,
     // but WITHOUT ANY WARRANTY; without even the implied warranty of
     // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     // GNU General Public License for more details.
     //
     // You should have received a copy of the GNU General Public License
     // along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
     //
     // $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $
     // $LastChangedRevision: 2378 $
     // $LastChangedBy: sheiden $
     //
     // imports
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcpapplication.swt.widget.parameters.*
     // parameters
     // Input text file; the XML file is written next to it with the same base name.
     @Field @Option(name="inputFile",usage="fichier Taltac", widget="File", required=true, def='')
     def inputFile
     // Character encoding used to read the input file.
     @Field @Option(name="characterEncoding",usage="Système d'encodage des caractères utilisé pour le fichier", widget="String", required=true, def='UTF-8')
     def characterEncoding = "UTF-8"
     // Name of the '++++' section that is wrapped in a <head> element; empty to disable.
     @Field @Option(name="titleTag", usage="name of title ++++ tag", widget="String", required=false, def="titolo")
     def titleTag
     // When true, every ordinary text line is written as a <p> element.
     @Field @Option(name="addParagraphs", usage="encode each text line as a paragraph", widget="Boolean", required=true, def="true")
     def addParagraphs
     if (!ParametersDialog.open(this)) return
     // The XML file is created next to the input file, with the same base name.
     name = inputFile.getName()
     idx = name.lastIndexOf(".")
     if (idx > 0) name = name.substring(0, idx)
     outputFile = new File(inputFile.getParentFile(), name+".xml")
     // number of the next <pb/>; one is written before every document but the first
     pageNum = 2
     // an unset title tag disables the <head> wrapping instead of failing
     def titleTagName = titleTag ?: ""
     // The XML declaration announces UTF-8, so the writer must really encode in UTF-8.
     outputFile.withWriter("UTF-8") { writer ->
     	writer.println '<?xml version="1.0" encoding="UTF-8"?>\n<taltac>'
     	def firstDoc = true
     	def inTitle = false
     	def currentTag = ""
     	def metadata = "<hi>no metadata</hi>"
     	def metadataEdited = false
     	// Closes the section opened by the last '++++' line, if any, together with
     	// its <head> wrapper when that section is the title.
     	def closeCurrentTag = {
     		if (currentTag.size() > 0) {
     			writer.println sprintf("</%s>", currentTag)
     			if (inTitle) {
     				writer.println "</head>"
     				inTitle = false
     			}
     			currentTag = ""
     		}
     	}
     	inputFile.eachLine(characterEncoding) { line ->
     		line = line.replaceAll("&", "&amp;")
     		line = line.replaceAll("<", "&lt;")
     		// document header line, e.g.:
     		// ****yahoobanque1 *data=31gen *autore=da *rubrica=da *ora=08 *agenzia=reuters *grafici=da
     		group = (line =~ /^\*\*\*\*([^ ]+) (.*)/)
     		if (group) {
     			// '"' is escaped because the value is written inside an attribute
     			ident = group[0][1].replace('"', '&quot;')
     			properties = group[0][2]
     			// the properties are kept twice: as a list for the body of the document...
     			metadata=properties.replaceAll(/ ?\*([^=]+)=([^ ]+)/) { all2, propname, propvalue ->
     				"<item><hi>$propname</hi>: $propvalue</item>\n"
     			}
     			metadata="<list type=\"unordered\">\n"+metadata+"</list>"
     			metadataEdited = false
     			// ...and as attributes of the <doc> element
     			properties=properties.replaceAll(/\*([^=]+)=([^ ]+)/) { all2, propname, propvalue ->
     				def escaped = propvalue.replace('"', '&quot;')
     				"$propname=\"$escaped\""
     			}
     			if (firstDoc) {
     				firstDoc = false
     				writer.println "<doc ident=\"$ident\" $properties>"
     			}else{
     				closeCurrentTag()
     				writer.println "</doc>\n<pb n=\"${pageNum++}\"/>\n<doc ident=\"$ident\" $properties>"
     			}
     		}else{
     			// section line, e.g.: ++++titolo
     			group = (line =~ /^\+\+\+\+(.+)/)
     			if (group) {
     				tag = group[0][1]
     				if (currentTag.size() > 0) {
     					closeCurrentTag()
     					// the metadata list is written once per document, after the
     					// first section that gets closed by a following section
     					if (!metadataEdited) {
     						writer.println metadata
     						metadataEdited = true
     					}
     				}
     				// the title section is wrapped in a <head> element
     				if (titleTagName.size() > 0 && tag == titleTagName) {
     					writer.println "<head>"
     					inTitle = true
     				}
     				writer.println sprintf("<%s>", tag)
     				currentTag = tag
     			}else{
     				// normal line
     				if (addParagraphs) {
     					writer.println "<p>$line</p>"
     				}else{
     					writer.println "$line"
     				}
     			}
     		}
     	}
     	// Close only what is really open, so that the result stays well-formed when
     	// the input ends in a title, has no section or has no document at all.
     	closeCurrentTag()
     	if (!firstDoc) writer.println "</doc>"
     	writer.println "</taltac>\n"
     	// withWriter closes the writer itself
     }

     // Copyright © 2015 - ENS de Lyon - http://textometrie.ens-lyon.fr
     // @author sheiden
     //
     // This file is part of the TXM platform.
     //
     // The TXM platform is free software: you can redistribute it and/or modify
     // it under the terms of the GNU General Public License as published by
     // the Free Software Foundation, either version 3 of the License, or
     // (at your option) any later version.
     //
     // The TXM platform is distributed in the hope that it will be useful,
     // but WITHOUT ANY WARRANTY; without even the implied warranty of
     // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     // GNU General Public License for more details.
     //
     // You should have received a copy of the GNU General Public License
     // along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
     //
     // $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $
     // $LastChangedRevision: 2378 $
     // $LastChangedBy: sheiden $
     //
     // STANDARD DECLARATIONS
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import java.nio.charset.Charset
     import org.txm.rcpapplication.swt.widget.parameters.*
     import org.txm.utils.*
     import javax.xml.stream.*
     import java.net.URL
     import java.util.regex.Matcher;
     import java.util.regex.Pattern;
     // PARAMETERS
     // Directory whose files (sub-directories are skipped) are searched.
     @Field @Option(name="inputDirectory", usage="TXT directory", widget="Folder", required=false, def="")
     File inputDirectory;
     // Regular expression matched against the whole content of each file.
     @Field @Option(name="regexp", usage="Regular expression to match", widget="String", required=false, def="")
     String regexp;
     // Character encoding of the files.
     @Field @Option(name="encoding", usage="File encoding", widget="String", required=false, def="UTF-8")
     String encoding;
     // PARAMETERS
     // The dialog is only opened when no input directory has been set beforehand.
     if (inputDirectory == null) {
     	if (!ParametersDialog.open(this)) return;
     }
     // SANITY CHECK
     if (inputDirectory==null || regexp==null) { println "** FindMultiLineRegExpMacro: the input directory and the regular expression must be specified."; return}
     if (!inputDirectory.exists()) { println "** FindMultiLineRegExpMacro: impossible to access the '$inputDirectory' input directory."; return}
     // listFiles() returns null when the path is not a directory or cannot be read
     def files = inputDirectory.listFiles()
     if (files == null) { println "** FindMultiLineRegExpMacro: impossible to access the '$inputDirectory' input directory."; return}
     // MAIN BODY
     def p = /$regexp/
     // honour the 'encoding' parameter instead of the platform default charset
     def charsetName = encoding ?: "UTF-8"
     println "-- looking for: $regexp"
     for (File inputfile : files) {
     	if (inputfile.isDirectory()) continue // ignore
     	// the whole file is matched at once so that the expression can span several lines
     	def matcher = inputfile.getText(charsetName) =~ p
     	def count = matcher.size()
     	println "\n-- file:"+inputfile.getName() + " "+count + " match" +((count > 1)?"s":"")
     	if (count > 0) {
     		matcher.each { println it }
     	}
     }

     package org.txm.macro
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import java.nio.charset.Charset
     import org.txm.rcpapplication.swt.widget.parameters.*
     import org.txm.utils.*
     import javax.xml.stream.*
     import java.net.URL
     import java.util.regex.Matcher;
     import java.util.regex.Pattern;
     // Directory whose files (sub-directories are skipped) are searched line by line.
     @Field @Option(name="inputDirectory",usage="TXT directory", widget="Folder", required=false, def="directory path")
     File inputDirectory;
     // Regular expression looked for in each line.
     @Field @Option(name="regexp",usage="Regular expression to match", widget="String", required=false, def="regular expression")
     String regexp;
     // Character encoding of the files.
     @Field @Option(name="encoding",usage="File encoding", widget="String", required=false, def="UTF-8")
     String encoding;
     if (!ParametersDialog.open(this)) return;
     // SANITY CHECK
     if (inputDirectory == null || regexp == null) { println "** the input directory and the regular expression must be specified."; return }
     // listFiles() returns null when the path is not a directory or cannot be read
     def files = inputDirectory.listFiles()
     if (files == null) { println "** impossible to access the '$inputDirectory' input directory."; return }
     // compile the expression once instead of once per line
     def p = ~/$regexp/
     // honour the 'encoding' parameter (its default value is UTF-8)
     def charsetName = encoding ?: "UTF-8"
     println "processing: "+files
     for (File inputfile : files) {
     	if (inputfile.isDirectory()) continue // ignore
     	// collect the matching lines with their line number
     	def lines = []
     	inputfile.eachLine(charsetName) { line, n ->
     		if ((line =~ p).find()) {
     			lines << "  line $n: $line"
     		}
     	}
     	println inputfile.getName() + " "+lines.size() + " match" +((lines.size() > 1)?"s":"")
     	if (lines.size() > 0) {
     		for (String s : lines ) println s
     	}
     }

     // STANDARD DECLARATIONS
     package org.txm.macro
     import org.kohsuke.args4j.*
     import groovy.transform.Field
     import org.txm.rcpapplication.swt.widget.parameters.*
     import ims.tiger.importfilter.*
     import ims.tiger.importfilter.bracketing.*
     import org.apache.log4j.*
     // BEGINNING OF PARAMETERS
     // Declare each parameter here
     // (available widget types: Query, File, Folder, String, Text, Boolean, Integer, Float and Date)
     // File converted when SourceFolder is not set.
     @Field @Option(name="SourceFile", usage="SourceFilename", widget="File", required=false, def="input.mrg")
     def SourceFile
     // XML file written when SourceFolder is not set.
     @Field @Option(name="TargetFile", usage="TargetFilename", widget="File", required=false, def="output.xml")
     def TargetFile
     // Identifier passed to the filter as XML target ID when SourceFolder is not set.
     @Field @Option(name="XMLTargetID", usage="XMLTargetID", widget="String", required=false, def="XMLTargetID")
     def XMLTargetID
     // When set, every file of this folder is converted and SourceFile is ignored.
     @Field @Option(name="SourceFolder", usage="SourceFolder (leave empty if SourceFile set)", widget="Folder", required=false, def="input <remove if SourceFile set>")
     def SourceFolder
     // Folder receiving one '.xml' file per source file when SourceFolder is set.
     @Field @Option(name="TargetFolder", usage="TargetFolder (leave empty if TargetFile set)", widget="Folder", required=false, def="output")
     def TargetFolder
     // The filter receives this value minus one as its maximum number of sentences.
     @Field @Option(name="MaximumNumberOfSentences", usage="MaximumNumberOfSentences", widget="Integer", required=true, def="0")
     def MaximumNumberOfSentences
     // Open the parameters input dialog box
     if (!ParametersDialog.open(this)) return
     // END OF PARAMETERS
     BasicConfigurator.configure()
     // NOTE(review): MyUPennFilter and SilentImportFilterHandler are not defined in
     // this script - presumably provided by the imported ims.tiger packages; confirm
     ImportFilter filter = new MyUPennFilter()
     ImportFilterHandler handler = new SilentImportFilterHandler()
     filter.setImportFilterHandler(handler)
     filter.setCompression(false)
     filter.setSchemaFilename(System.getProperty("user.home")+"/TXM/xml/xsd/tiger/TigerXML.xsd")
     filter.setMaximumNumberOfSentences(MaximumNumberOfSentences-1)
     // Converts one source file into one XML file; a failure is reported and does
     // not stop the conversion of the following files.
     def runConversion = { String source, String target, String targetId ->
     	filter.setSourceFilename(source)
     	filter.setXMLTargetFilename(target)
     	filter.setXMLTargetID(targetId)
     	println "Converting '"+source+"' to '"+target+"'"
     	try {
     		filter.startConversion()
     	} catch (Exception e) {
     		e.printStackTrace()
     	}
     }
     if (SourceFolder == null) {
     	// single file mode
     	runConversion(SourceFile.getAbsolutePath(), TargetFile.getAbsolutePath(), XMLTargetID)
     } else {
     	// folder mode: listFiles() returns null when the path is not a readable directory
     	def sourceFiles = SourceFolder.listFiles()
     	if (sourceFiles == null) {
     		println "** impossible to access the '$SourceFolder' source folder."
     		return
     	}
     	for (File inputfile : sourceFiles) {
     		if (inputfile.isDirectory()) continue // ignore
     		fileNameE = inputfile.getName()
     		i = fileNameE.lastIndexOf('.')
     		if (i > 0) {
     			extension = fileNameE.substring(i+1)
     			fileName = fileNameE.substring(0, i)
     		} else {
     			// no extension to strip: substring(0, -1) would throw an exception
     			extension = ""
     			fileName = fileNameE
     		}
     		// the target ID is the source file name without its extension
     		fileId = fileName
     		input = inputfile.getAbsolutePath()
     		output = TargetFolder.getAbsolutePath()+"/"+fileName+".xml"
     		runConversion(input, output, fileId)
     	}
     }

     PennTreebank2TIGER macro
     ========================
 -07, sheiden
     * Install
     - unzip PennTreebank2TIGER.zip
     - copy TigerXML.xsd in $HOME/TXM/xml/xsd/tiger/TigerXML.xsd
     - copy PennTreebank2TIGERMacro.groovy in $HOME/TXM/scripts/macro/org/txm/macro
     - refresh macro view
     * Parameters
     - SourceFile: selects the single-file conversion mode (only this file is converted)
     - SourceFolder: selects the folder conversion mode (every file of this folder is converted)
     -> fill only the field to be used
     - TargetFile: if SourceFile then use that target file
     - XMLTargetID: if SourceFile then use that target ID
     - TargetFolder: if SourceFolder then use that target folder to save targets
      - target files are named from the source filename with the extension stripped and '.xml' added
      - target IDs are named from the source filename with the extension stripped
     - MaximumNumberOfSentences: maximum number of sentences to convert per file (set to '0' to convert all sentences)

     <?xml version="1.0"?>
     <xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
      <!-- ==================================================================
           XML Schema for the TIGER-XML format
           http://www.ims.uni-stuttgart.de/projekte/TIGER/public/TigerXML.xsd
           ==================================================================
           TIGER Project, Wolfgang Lezius
           IMS, University of Stuttgart, 04/01/2003
           ================================================================== -->
       <!-- ======================================================
            INCLUDES DECLARATION OF THE HEADER
            ====================================================== -->
       <xsd:include schemaLocation="TigerXMLHeader.xsd" />
       <!-- ======================================================
            INCLUDES DECLARATION OF SUBCORPORA AND SENTENCES
            ====================================================== -->
       <xsd:include schemaLocation="TigerXMLSubcorpus.xsd" />
       <!-- ======================================================
            DECLARATION OF THE CORPUS DOCUMENT
            ====================================================== -->
       <!-- declaration of the root element: corpus
            content: an optional head followed by exactly one body;
            headType and idType are not declared in this file (presumably in the
            included TigerXMLHeader.xsd / TigerXMLSubcorpus.xsd: to be confirmed) -->
       <xsd:element name="corpus">
         <xsd:complexType>
           <xsd:sequence>
             <xsd:choice>
                <!-- header of the document is optional -->
                <xsd:element name="head" type="headType" minOccurs="0" maxOccurs="1"/>
             </xsd:choice>
             <!-- body of the document is mandatory -->
             <xsd:element name="body" type="bodyType" minOccurs="1" maxOccurs="1"/>
           </xsd:sequence>
           <!-- corpus ID -->
           <xsd:attribute name="id"      type="idType"     use="required" />
           <!-- optional attribute: TigerXML version; used by TIGERSearch only -->
           <xsd:attribute name="version" type="xsd:string" use="optional" />
         </xsd:complexType>
       </xsd:element>
       <!-- declaration of the body type:
            a non-empty series of subcorpus and s (sentence) elements, in any order;
            subcorpusType and sentenceType are not declared in this file (presumably
            in the included TigerXMLSubcorpus.xsd: to be confirmed) -->
       <xsd:complexType name="bodyType">
         <xsd:choice minOccurs="1" maxOccurs="unbounded">
           <xsd:element name="subcorpus" type="subcorpusType" minOccurs="1" maxOccurs="1"/>
           <xsd:element name="s" type="sentenceType" minOccurs="1" maxOccurs="1"/>
         </xsd:choice>
       </xsd:complexType>
     </xsd:schema>

Laboratoire ICAR » Plateforme TXM

Révision 3950