Révision 3950
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/csv/CSV2XMLTweetAwareMacro.groovy (revision 3950) | ||
|---|---|---|
| 1 |
// Copyright © 2015 - ENS de Lyon - http://textometrie.ens-lyon.fr |
|
| 2 |
// @author sheiden |
|
| 3 |
// |
|
| 4 |
// This file is part of the TXM platform. |
|
| 5 |
// |
|
| 6 |
// The TXM platform is free software: you can redistribute it and/or modify |
|
| 7 |
// it under the terms of the GNU General Public License as published by |
|
| 8 |
// the Free Software Foundation, either version 3 of the License, or |
|
| 9 |
// (at your option) any later version. |
|
| 10 |
// |
|
| 11 |
// The TXM platform is distributed in the hope that it will be useful, |
|
| 12 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 13 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
| 14 |
// GNU General Public License for more details. |
|
| 15 |
// |
|
| 16 |
// You should have received a copy of the GNU General Public License |
|
| 17 |
// along with the TXM platform. If not, see <http://www.gnu.org/licenses/>. |
|
| 18 |
// |
|
| 19 |
// $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $ |
|
| 20 |
// $LastChangedRevision: 2378 $ |
|
| 21 |
// $LastChangedBy: sheiden $ |
|
| 22 |
// |
|
| 23 |
|
|
| 24 |
package org.txm.macro.csv |
|
| 25 |
|
|
| 26 |
import org.kohsuke.args4j.* |
|
| 27 |
import groovy.transform.Field |
|
| 28 |
import java.nio.charset.Charset |
|
| 29 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
| 30 |
import org.txm.utils.* |
|
| 31 |
import org.txm.importer.* |
|
| 32 |
import javax.xml.stream.* |
|
| 33 |
import java.net.URL |
|
| 34 |
//import org.apache.xerces.util |
|
| 35 |
|
|
| 36 |
@Field @Option(name="inputFile",usage="CSV File", widget="File", required=false, def="/home/sheiden/Corpus/src/charlie/csv/fr_07jan.csv") |
|
| 37 |
File inputFile; |
|
| 38 |
|
|
| 39 |
@Field @Option(name="columnSeparator",usage="column columnSeparator", widget="String", required=false, def=",") |
|
| 40 |
def columnSeparator; |
|
| 41 |
|
|
| 42 |
@Field @Option(name="characterEncoding",usage="File characterEncoding", widget="String", required=false, def="UTF-8") |
|
| 43 |
def characterEncoding; |
|
| 44 |
|
|
| 45 |
@Field @Option(name="rootTag",usage="Root tag name", widget="String", required=false, def="tweets") |
|
| 46 |
def rootTag; |
|
| 47 |
|
|
| 48 |
@Field @Option(name="textTag",usage="Text tag name", widget="String", required=false, def="tweet") |
|
| 49 |
def textTag; |
|
| 50 |
|
|
| 51 |
@Field @Option(name="metadataColumnList",usage="metadataColumnList column list separated with comma", widget="String", required=false, def="tweet-id-str,job-id,created-at,from-user,from-user-id-str,from-user-name,from-user-fullname,from-user-followers,from-user-following,from-user-favorites,from-user-tweets,from-user-timezone,to-user,to-user-id-str,to-user-name,source,location-geo,location-geo-0,location-geo-1,iso-language,analysis-state") |
|
| 52 |
def metadataColumnList; |
|
| 53 |
|
|
| 54 |
@Field @Option(name="textColumnList",usage="textColumnList column list separated with comma", widget="String", required=false, def="text") |
|
| 55 |
def textColumnList; |
|
| 56 |
|
|
| 57 |
if (!ParametersDialog.open(this)) return; |
|
| 58 |
|
|
| 59 |
metadataColumnList = metadataColumnList.split(",")
|
|
| 60 |
textColumnList = textColumnList.split(",")
|
|
| 61 |
textTag = textTag.trim() |
|
| 62 |
rootTag = rootTag.trim() |
|
| 63 |
|
|
| 64 |
CsvReader reader = new CsvReader(inputFile.getAbsolutePath(), columnSeparator.charAt(0), Charset.forName(characterEncoding)); |
|
| 65 |
if (!reader.readHeaders()) {
|
|
| 66 |
println "Error: no header" |
|
| 67 |
return |
|
| 68 |
} |
|
| 69 |
|
|
| 70 |
def headers = Arrays.asList(reader.getHeaders()) |
|
| 71 |
headers.eachWithIndex {str, index -> headers[index] = str.replaceAll(/_/, "-") }
|
|
| 72 |
reader.setHeaders(headers as String[]) |
|
| 73 |
|
|
| 74 |
println "Root tag: $rootTag" |
|
| 75 |
println "Text tag: $textTag" |
|
| 76 |
println "Metadata column(s): $metadataColumnList" |
|
| 77 |
println "Text column(s): $textColumnList" |
|
| 78 |
println "Processing..." |
|
| 79 |
|
|
| 80 |
def ok = true |
|
| 81 |
for (String m : metadataColumnList) {
|
|
| 82 |
m = m.trim() |
|
| 83 |
if (!headers.contains(m)) {
|
|
| 84 |
println "Error: missing metadata column named '$m'" |
|
| 85 |
ok = false |
|
| 86 |
} |
|
| 87 |
} |
|
| 88 |
for (String t : textColumnList) {
|
|
| 89 |
t = t.trim() |
|
| 90 |
if (!headers.contains(t)) {
|
|
| 91 |
println "Error: missing text column named '$t'" |
|
| 92 |
ok = false |
|
| 93 |
} |
|
| 94 |
} |
|
| 95 |
|
|
| 96 |
if (!ok) {
|
|
| 97 |
println "** CSV2XML: Column(s) missing, aborting..." |
|
| 98 |
return |
|
| 99 |
} |
|
| 100 |
|
|
| 101 |
String name = inputFile.getName() |
|
| 102 |
int idx = name.indexOf(".")
|
|
| 103 |
if (idx > 0) name = name.substring(0, idx) |
|
| 104 |
File outputfile = new File(inputFile.getParentFile(), name+".xml") |
|
| 105 |
|
|
| 106 |
XMLOutputFactory factory = XMLOutputFactory.newInstance() |
|
| 107 |
Writer output = new OutputStreamWriter(new FileOutputStream(outputfile) , "UTF-8") |
|
| 108 |
XMLStreamWriter writer = factory.createXMLStreamWriter(output) |
|
| 109 |
|
|
| 110 |
writer.writeStartDocument("UTF-8", "1.0")
|
|
| 111 |
writer.writeCharacters("\n")
|
|
| 112 |
writer.writeStartElement(rootTag) |
|
| 113 |
writer.writeCharacters("\n")
|
|
| 114 |
|
|
| 115 |
int pb_n = 1 |
|
| 116 |
def tweet = [:] |
|
| 117 |
|
|
| 118 |
// Character classes used to validate one XML character (one Unicode code point).
// Ranges follow the XML 1.1 summary at http://en.wikipedia.org/wiki/Valid_characters_in_XML
// The patterns are compiled once here instead of on every call.
def validXmlCharPattern = ~/[\u0001-\uD7FF\uE000-\uFFFD\x{10000}-\x{10FFFF}]/
def controlXmlCharPattern = ~/[\u0001-\u0008\u000b-\u000c\u000E-\u001F\u007F-\u0084\u0086-\u009F]/
// "Characters allowed but discouraged" (every range separator is a plain hyphen)
def discouragedXmlCharPattern = ~/[\uFDD0-\uFDEF\x{1FFFE}-\x{1FFFF}\x{2FFFE}-\x{2FFFF}\x{3FFFE}-\x{3FFFF}\x{4FFFE}-\x{4FFFF}\x{5FFFE}-\x{5FFFF}\x{6FFFE}-\x{6FFFF}\x{7FFFE}-\x{7FFFF}\x{8FFFE}-\x{8FFFF}\x{9FFFE}-\x{9FFFF}\x{AFFFE}-\x{AFFFF}\x{BFFFE}-\x{BFFFF}\x{CFFFE}-\x{CFFFF}\x{DFFFE}-\x{DFFFF}\x{EFFFE}-\x{EFFFF}\x{FFFFE}-\x{FFFFF}\x{10FFFE}-\x{10FFFF}]/

/**
 * Tells whether a single character is acceptable in XML output.
 *
 * @param s one character, given as a char or as a String holding exactly one code point
 * @return false for null, for anything outside the valid XML ranges (including lone
 *         surrogates), for control characters and for discouraged characters; true otherwise
 */
def isAllValidXmlChars = { s ->
	if (s == null) return false
	String ch = s.toString()

	// the negation must apply to the result of the match, not to the operand:
	// '!s ==~ regex' is parsed as '(!s) ==~ regex'
	if (!(ch ==~ validXmlCharPattern)) {
		// not in valid ranges
		return false
	}
	if (ch ==~ controlXmlCharPattern) {
		// a control character
		return false
	}
	if (ch ==~ discouragedXmlCharPattern) {
		return false
	}

	return true
}

/**
 * Returns a copy of the input without the characters rejected by isAllValidXmlChars.
 *
 * The input is walked code point by code point so that characters outside the
 * Basic Multilingual Plane (stored as surrogate pairs) are tested as a whole
 * instead of being dropped one half at a time.
 *
 * @param input the text to filter
 * @return the filtered text
 */
def stripInvalidXmlCharacters = { input ->
	StringBuilder sb = new StringBuilder()
	int i = 0
	while (i < input.length()) {
		int codePoint = input.codePointAt(i)
		String ch = new String(Character.toChars(codePoint))
		if (isAllValidXmlChars(ch)) {
			sb.append(ch)
		}
		i += Character.charCount(codePoint)
	}

	return sb.toString()
}
|
| 150 |
|
|
| 151 |
|
|
| 152 |
def tokenizeAs |
|
| 153 |
tokenizeAs = { source, matchers ->
|
|
| 154 |
if (matchers.size() > 0) {
|
|
| 155 |
def inputPattern = matchers[0] |
|
| 156 |
def outputElement = matchers[1] |
|
| 157 |
def outputAttribute = matchers[2] |
|
| 158 |
def outputAttValue = matchers[3] |
|
| 159 |
matchers = matchers.drop(4) |
|
| 160 |
|
|
| 161 |
def m = (source =~ inputPattern) |
|
| 162 |
def lastMatch = 0 |
|
| 163 |
while(m.find()) {
|
|
| 164 |
tokenizeAs(source.substring(lastMatch, m.start()), matchers) |
|
| 165 |
if (outputAttribute == "type" && outputAttValue == "url") {
|
|
| 166 |
writer.writeStartElement("a")
|
|
| 167 |
writer.writeAttribute("href", m.group())
|
|
| 168 |
} |
|
| 169 |
writer.writeStartElement(outputElement) |
|
| 170 |
writer.writeAttribute(outputAttribute, outputAttValue) |
|
| 171 |
writer.writeAttribute("frpos", "NAM")
|
|
| 172 |
writer.writeAttribute("frlemma", m.group())
|
|
| 173 |
writer.writeCharacters(m.group()) |
|
| 174 |
writer.writeEndElement() |
|
| 175 |
if (outputAttribute == "type" && outputAttValue == "url") {
|
|
| 176 |
writer.writeEndElement() |
|
| 177 |
} |
|
| 178 |
lastMatch = m.end() |
|
| 179 |
} |
|
| 180 |
tokenizeAs(source.substring(lastMatch), matchers) |
|
| 181 |
} else {
|
|
| 182 |
writer.writeCharacters(source) |
|
| 183 |
} |
|
| 184 |
} |
|
| 185 |
|
|
| 186 |
while (reader.readRecord()) {
|
|
| 187 |
|
|
| 188 |
writer.writeCharacters(" ")
|
|
| 189 |
writer.writeEmptyElement("pb") // <pb/>
|
|
| 190 |
writer.writeAttribute("n", ""+pb_n++)
|
|
| 191 |
writer.writeCharacters("\n")
|
|
| 192 |
|
|
| 193 |
writer.writeCharacters(" ")
|
|
| 194 |
writer.writeStartElement(textTag) |
|
| 195 |
|
|
| 196 |
for (String m : metadataColumnList) {
|
|
| 197 |
m = m.trim() |
|
| 198 |
writer.writeAttribute(m, CleanFile.clean(reader.get(m))) |
|
| 199 |
} |
|
| 200 |
|
|
| 201 |
writer.writeCharacters("\n")
|
|
| 202 |
|
|
| 203 |
for (String t : textColumnList) {
|
|
| 204 |
t = t.trim() |
|
| 205 |
|
|
| 206 |
writer.writeCharacters(" ")
|
|
| 207 |
writer.writeStartElement(t) |
|
| 208 |
|
|
| 209 |
// look for re-tweets |
|
| 210 |
def c = reader.get(t) |
|
| 211 |
def retweet = false |
|
| 212 |
def rt_pattern = ~/(?s:^RT @[^:]+: (.*)$)/ |
|
| 213 |
def rt_m = (c =~ rt_pattern) |
|
| 214 |
if (rt_m) retweet = true |
|
| 215 |
if (retweet) { // it is a re-tweet
|
|
| 216 |
// take off and count all re-tweet prefixes |
|
| 217 |
def nPrefix = 0 |
|
| 218 |
while (retweet) {
|
|
| 219 |
c = rt_m.group(1) |
|
| 220 |
rt_m = (c =~ rt_pattern) |
|
| 221 |
if (rt_m) { retweet = true } else { retweet = false }
|
|
| 222 |
nPrefix++ |
|
| 223 |
} |
|
| 224 |
def crt = sprintf("RT-%d ", nPrefix)+c
|
|
| 225 |
def n = tweet.get(crt) |
|
| 226 |
if (n) {
|
|
| 227 |
tweet.put(crt, n+1) |
|
| 228 |
} else {
|
|
| 229 |
tweet.put(crt, 1) |
|
| 230 |
} |
|
| 231 |
} else {
|
|
| 232 |
tweet.put(c, 1) |
|
| 233 |
tokenizeAs(CleanFile.clean(c),\ |
|
| 234 |
[/@\p{L}+/, "w", "type", "attag",\
|
|
| 235 |
/#\p{L}+/, "w", "type", "hashtag",\
|
|
| 236 |
/((http|ftp|https):\/\/[\p{L}\-_]+(\.[\p{L}\-_]+)+([\p{L}\-\.,@?^=%&:\/~\+#]*[\p{L}\-\@?^=%&\/~\+#])?)/, "w", "type", "url"])
|
|
| 237 |
} |
|
| 238 |
writer.writeEndElement() // t |
|
| 239 |
writer.writeCharacters("\n")
|
|
| 240 |
} |
|
| 241 |
|
|
| 242 |
writer.writeCharacters(" ")
|
|
| 243 |
writer.writeEndElement() // textTag |
|
| 244 |
writer.writeCharacters("\n")
|
|
| 245 |
} |
|
| 246 |
|
|
| 247 |
writer.writeEndElement() // rootTag |
|
| 248 |
writer.writeCharacters("\n")
|
|
| 249 |
writer.close() |
|
| 250 |
output.close() |
|
| 251 |
reader.close() |
|
| 252 |
|
|
| 253 |
// Final report: number of distinct [re-]tweet texts, then the most frequent ones.
// At most 50 entries are listed, as announced by the message printed below.
def nt = tweet.size()
println sprintf("\n%d tweets uniques sur %d lus", nt, pb_n-1)

if (nt > 50) {
	nt = 50
	println "Printing first 50 most frequent [re-]tweets:"
}

// sort by decreasing frequency and keep the nt first entries only
tweet.sort { -it.value }.take(nt).each { entry ->
	def key = entry.key
	if (key.length() > 0) {
		// show at most the first 100 characters of the tweet
		println sprintf("[%d] %s...", entry.value, key[0..(Math.min(key.length(), 100)-1)])
	} else {
		println sprintf("Empty tweet content occurring %d times.", entry.value)
	}
}
|
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/txt/MultiLineSearchReplaceInDirectoryMacro.groovy (revision 3950) | ||
|---|---|---|
| 1 |
package org.txm.macro.txt |
|
| 2 |
// Copyright © - ENS de Lyon - http://textometrie.ens-lyon.fr |
|
| 3 |
// @author sheiden |
|
| 4 |
// |
|
| 5 |
// This file is part of the TXM platform. |
|
| 6 |
// |
|
| 7 |
// The TXM platform is free software: you can redistribute it and/or modify |
|
| 8 |
// it under the terms of the GNU General Public License as published by |
|
| 9 |
// the Free Software Foundation, either version 3 of the License, or |
|
| 10 |
// (at your option) any later version. |
|
| 11 |
// |
|
| 12 |
// The TXM platform is distributed in the hope that it will be useful, |
|
| 13 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 14 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
| 15 |
// GNU General Public License for more details. |
|
| 16 |
// |
|
| 17 |
// You should have received a copy of the GNU General Public License |
|
| 18 |
// along with the TXM platform. If not, see <http://www.gnu.org/licenses/>. |
|
| 19 |
// |
|
| 20 |
// $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $ |
|
| 21 |
// $LastChangedRevision: 2378 $ |
|
| 22 |
// $LastChangedBy: sheiden $ |
|
| 23 |
// |
|
| 24 |
|
|
| 25 |
// STANDARD DECLARATIONS |
|
| 26 |
|
|
| 27 |
// imports |
|
| 28 |
import org.kohsuke.args4j.* |
|
| 29 |
import groovy.transform.Field |
|
| 30 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
| 31 |
|
|
| 32 |
// PARAMETERS |
|
| 33 |
|
|
| 34 |
// **change this parameter** |
|
| 35 |
@Field @Option(name="inputDirectory", usage="Dossier contenant les fichiers à modifier", widget="Folder", required=true, def='/home') |
|
| 36 |
inputDirectory = new File(System.getProperty("user.home"), "Bureau/voeux")
|
|
| 37 |
|
|
| 38 |
// **change this parameter** |
|
| 39 |
@Field @Option(name="extension", usage="extension des fichiers à modifier (expression régulière)", widget="String", required=true, def='\\.txt') |
|
| 40 |
extension = "\\.txt" |
|
| 41 |
|
|
| 42 |
// **change this parameter** |
|
| 43 |
@Field @Option(name="find", usage="Expression régulière à chercher", widget="String", required=true, def='’') |
|
| 44 |
find = "’" |
|
| 45 |
|
|
| 46 |
// **change this parameter** |
|
| 47 |
@Field @Option(name="replaceWith", usage="Chaîne de remplacement", widget="String", required=false, def='\'') |
|
| 48 |
replaceWith = "'" |
|
| 49 |
|
|
| 50 |
// **change this parameter** |
|
| 51 |
@Field @Option(name="encoding", usage="Encodage des caractères des fichiers", widget="String", required=true, def='UTF-8') |
|
| 52 |
encoding = "utf-8" |
|
| 53 |
|
|
| 54 |
// PARAMETERS DIALOG |
|
| 55 |
|
|
| 56 |
if (!ParametersDialog.open(this)) return; |
|
| 57 |
|
|
| 58 |
// SANITY CHECK |
|
| 59 |
|
|
| 60 |
if (inputDirectory==null || find==null || replaceWith==null) { println "** MultiLineSearchReplaceInDirectoryMacro: the input directory, the regular expression to search and replacement must be specified."; return}
|
|
| 61 |
|
|
| 62 |
if (!inputDirectory.exists()) { println "** MultiLineSearchReplaceInDirectoryMacro: impossible to access the '$inputDirectory' input directory."; return}
|
|
| 63 |
|
|
| 64 |
// MAIN BODY |
|
| 65 |
|
|
| 66 |
println "-- working in $inputDirectory directory with files of '$extension' extension" |
|
| 67 |
println "-- replacing '$find' with '$replaceWith'" |
|
| 68 |
|
|
| 69 |
// find = /date="([0-9]+)-([0-9]+-[0-9]+)"/ |
|
| 70 |
// **change this parameter** |
|
| 71 |
// replaceWith = 'date="$1-$2" year="$1"' |
|
| 72 |
// **change this parameter** (warning: '$1', '$2'... can be interpreted by Groovy in "..." strings) |
|
| 73 |
// |
|
| 74 |
// RECETTE |
|
| 75 |
// <tei:pb [^>]+>\n <tei:p>[0-9]+</tei:p>\n <tei:p>[^ ]+ 18[5-6][0-9]\.</tei:p>\n |
|
| 76 |
|
|
| 77 |
def p = /$find/ |
|
| 78 |
|
|
| 79 |
// Rewrite each file whose name matches the extension: the replaced content is first
// written to a temporary file in the same directory, which then takes the place of the original.
inputDirectory.eachFileMatch(~/.*$extension/) { file -> // for each file matching extension
	if (!file.isFile()) return // eachFileMatch also yields directories: skip them

	println "\n-- processing: "+file.getName()
	def tmp = File.createTempFile("SearchReplaceInDirectoryTemp", ".tmp", file.getParentFile()) // create temporary file
	tmp.withWriter(encoding) { writer ->
		writer.print(file.getText(encoding).replaceAll(p, replaceWith))
	}

	// delete() and renameTo() report failure through their return value only:
	// check them so that a failed replacement does not go unnoticed
	if (!file.delete()) {
		println "** MultiLineSearchReplaceInDirectoryMacro: impossible to delete '$file', file left unchanged."
		tmp.delete()
		return
	}
	if (!tmp.renameTo(file)) {
		println "** MultiLineSearchReplaceInDirectoryMacro: impossible to rename '$tmp' to '$file', the result is in the temporary file."
	}
}
|
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/txt/Taltac2XMLMacro.groovy (revision 3950) | ||
|---|---|---|
| 1 |
package org.txm.macro.line |
|
| 2 |
// Copyright © 2018 - ENS de Lyon - http://textometrie.ens-lyon.fr |
|
| 3 |
// @author sheiden |
|
| 4 |
// |
|
| 5 |
// This file is part of the TXM platform. |
|
| 6 |
// |
|
| 7 |
// The TXM platform is free software: you can redistribute it and/or modify |
|
| 8 |
// it under the terms of the GNU General Public License as published by |
|
| 9 |
// the Free Software Foundation, either version 3 of the License, or |
|
| 10 |
// (at your option) any later version. |
|
| 11 |
// |
|
| 12 |
// The TXM platform is distributed in the hope that it will be useful, |
|
| 13 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 14 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
| 15 |
// GNU General Public License for more details. |
|
| 16 |
// |
|
| 17 |
// You should have received a copy of the GNU General Public License |
|
| 18 |
// along with the TXM platform. If not, see <http://www.gnu.org/licenses/>. |
|
| 19 |
// |
|
| 20 |
// $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $ |
|
| 21 |
// $LastChangedRevision: 2378 $ |
|
| 22 |
// $LastChangedBy: sheiden $ |
|
| 23 |
// |
|
| 24 |
|
|
| 25 |
// imports |
|
| 26 |
import org.kohsuke.args4j.* |
|
| 27 |
import groovy.transform.Field |
|
| 28 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
| 29 |
|
|
| 30 |
// parameters |
|
| 31 |
|
|
| 32 |
// **change this parameter** |
|
| 33 |
@Field @Option(name="inputFile",usage="fichier Taltac", widget="File", required=true, def='') |
|
| 34 |
def inputFile |
|
| 35 |
|
|
| 36 |
// **change this parameter** |
|
| 37 |
@Field @Option(name="characterEncoding",usage="Système d'encodage des caractères utilisé pour le fichier", widget="String", required=true, def='UTF-8') |
|
| 38 |
def characterEncoding = "UTF-8" |
|
| 39 |
|
|
| 40 |
@Field @Option(name="titleTag", usage="name of title ++++ tag", widget="String", required=false, def="titolo") |
|
| 41 |
def titleTag |
|
| 42 |
|
|
| 43 |
@Field @Option(name="addParagraphs", usage="encode each text line as a paragraph", widget="Boolean", required=true, def="true") |
|
| 44 |
def addParagraphs |
|
| 45 |
|
|
| 46 |
if (!ParametersDialog.open(this)) return |
|
| 47 |
|
|
| 48 |
name = inputFile.getName() |
|
| 49 |
idx = name.lastIndexOf(".")
|
|
| 50 |
if (idx > 0) name = name.substring(0, idx) |
|
| 51 |
outputFile = new File(inputFile.getParentFile(), name+".xml") |
|
| 52 |
|
|
| 53 |
pageNum = 2 |
|
| 54 |
outputFile.write('') // create empty file
|
|
| 55 |
|
|
| 56 |
outputFile.withWriter() { writer ->
|
|
| 57 |
|
|
| 58 |
writer.println '<?xml version="1.0" encoding="UTF-8"?>\n<taltac>' |
|
| 59 |
|
|
| 60 |
def firstDoc = true |
|
| 61 |
def inTitle = false |
|
| 62 |
def currentTag = "" |
|
| 63 |
def metadata = "<hi>no metadata</hi>" |
|
| 64 |
def metadataEdited = false |
|
| 65 |
|
|
| 66 |
inputFile.eachLine(characterEncoding) { line ->
|
|
| 67 |
|
|
| 68 |
// Escape the XML special characters of the line before it is copied into the output.
// '&' is handled first so that the entity produced for '<' is not escaped a second time.
line = line.replaceAll("&", "&amp;")
line = line.replaceAll("<", "&lt;")
|
|
| 70 |
|
|
| 71 |
// ****yahoobanque1 *data=31gen *autore=da *rubrica=da *ora=08 *agenzia=reuters *grafici=da |
|
| 72 |
group = (line =~ /^\*\*\*\*([^ ]+) (.*)/) |
|
| 73 |
if (group) {
|
|
| 74 |
|
|
| 75 |
ident = group[0][1] |
|
| 76 |
properties = group[0][2] |
|
| 77 |
metadata=properties.replaceAll(/ ?\*([^=]+)=([^ ]+)/) { all2, propname, propvalue ->
|
|
| 78 |
"<item><hi>$propname</hi>: $propvalue</item>\n" |
|
| 79 |
} |
|
| 80 |
metadata="<list type=\"unordered\">\n"+metadata+"</list>" |
|
| 81 |
metadataEdited = false |
|
| 82 |
properties=properties.replaceAll(/\*([^=]+)=([^ ]+)/) { all2, propname, propvalue ->
|
|
| 83 |
"$propname=\"$propvalue\"" |
|
| 84 |
} |
|
| 85 |
if (firstDoc) {
|
|
| 86 |
firstDoc = false |
|
| 87 |
writer.println "<doc ident=\"$ident\" $properties>" |
|
| 88 |
}else{
|
|
| 89 |
if (currentTag.size() > 0) {
|
|
| 90 |
if (titleTag.size() > 0) {
|
|
| 91 |
if (inTitle) {
|
|
| 92 |
writer.println sprintf("</%s>", currentTag)
|
|
| 93 |
writer.println "</head>" |
|
| 94 |
inTitle = false |
|
| 95 |
currentTag = "" |
|
| 96 |
}else{
|
|
| 97 |
writer.println sprintf("</%s>", currentTag)
|
|
| 98 |
currentTag = "" |
|
| 99 |
} |
|
| 100 |
}else{
|
|
| 101 |
writer.println sprintf("</%s>", currentTag)
|
|
| 102 |
currentTag = "" |
|
| 103 |
} |
|
| 104 |
/* |
|
| 105 |
if (!metadataEdited) {
|
|
| 106 |
println sprintf ("ident = %s, currentTag = %s (%d), metadata", ident, currentTag, currentTag.size())
|
|
| 107 |
writer.println metadata |
|
| 108 |
metadataEdited = true |
|
| 109 |
} |
|
| 110 |
*/ |
|
| 111 |
} |
|
| 112 |
writer.println "</doc>\n<pb n=\"${pageNum++}\"/>\n<doc ident=\"$ident\" $properties>"
|
|
| 113 |
} |
|
| 114 |
}else{
|
|
| 115 |
|
|
| 116 |
// ++++titolo |
|
| 117 |
group = (line =~ /^\+\+\+\+(.+)/) |
|
| 118 |
if (group) {
|
|
| 119 |
|
|
| 120 |
tag = group[0][1] |
|
| 121 |
|
|
| 122 |
if (currentTag.size() > 0) {
|
|
| 123 |
if (titleTag.size() > 0) {
|
|
| 124 |
if (inTitle) {
|
|
| 125 |
writer.println sprintf("</%s>", currentTag)
|
|
| 126 |
writer.println "</head>" |
|
| 127 |
inTitle = false |
|
| 128 |
currentTag = "" |
|
| 129 |
}else{
|
|
| 130 |
writer.println sprintf("</%s>", currentTag)
|
|
| 131 |
currentTag = "" |
|
| 132 |
} |
|
| 133 |
}else{
|
|
| 134 |
writer.println sprintf("</%s>", currentTag)
|
|
| 135 |
currentTag = "" |
|
| 136 |
} |
|
| 137 |
if (!metadataEdited) {
|
|
| 138 |
writer.println metadata |
|
| 139 |
metadataEdited = true |
|
| 140 |
} |
|
| 141 |
} |
|
| 142 |
if (titleTag.size() > 0) {
|
|
| 143 |
if (tag == titleTag) {
|
|
| 144 |
writer.println "<head>" |
|
| 145 |
writer.println sprintf("<%s>", tag)
|
|
| 146 |
inTitle = true |
|
| 147 |
currentTag = tag |
|
| 148 |
}else{
|
|
| 149 |
writer.println sprintf("<%s>", tag)
|
|
| 150 |
currentTag = tag |
|
| 151 |
} |
|
| 152 |
}else{
|
|
| 153 |
writer.println sprintf("<%s>", tag)
|
|
| 154 |
currentTag = tag |
|
| 155 |
} |
|
| 156 |
|
|
| 157 |
}else{
|
|
| 158 |
|
|
| 159 |
// normal line |
|
| 160 |
if (addParagraphs) {
|
|
| 161 |
writer.println "<p>$line</p>" |
|
| 162 |
}else{
|
|
| 163 |
writer.println "$line" |
|
| 164 |
} |
|
| 165 |
} |
|
| 166 |
} |
|
| 167 |
} |
|
| 168 |
|
|
| 169 |
// Close what is still open at the end of the input, emitting only end tags that
// have a matching start tag: the current section (and the <head> wrapping it when the
// input ends inside the title), then the current <doc> if any, then the root element.
def closing = new StringBuilder()
if (currentTag.size() > 0) {
	closing << sprintf("</%s>\n", currentTag)
	if (inTitle) closing << "</head>\n"
}
if (!firstDoc) closing << "</doc>\n"
closing << "</taltac>\n"
writer.println closing.toString()
|
|
| 170 |
writer.close() |
|
| 171 |
} |
|
| 172 |
|
|
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/txt/FindMultiLineRegExpMacro.groovy (revision 3950) | ||
|---|---|---|
| 1 |
// Copyright © 2015 - ENS de Lyon - http://textometrie.ens-lyon.fr |
|
| 2 |
// @author sheiden |
|
| 3 |
// |
|
| 4 |
// This file is part of the TXM platform. |
|
| 5 |
// |
|
| 6 |
// The TXM platform is free software: you can redistribute it and/or modify |
|
| 7 |
// it under the terms of the GNU General Public License as published by |
|
| 8 |
// the Free Software Foundation, either version 3 of the License, or |
|
| 9 |
// (at your option) any later version. |
|
| 10 |
// |
|
| 11 |
// The TXM platform is distributed in the hope that it will be useful, |
|
| 12 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 13 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
| 14 |
// GNU General Public License for more details. |
|
| 15 |
// |
|
| 16 |
// You should have received a copy of the GNU General Public License |
|
| 17 |
// along with the TXM platform. If not, see <http://www.gnu.org/licenses/>. |
|
| 18 |
// |
|
| 19 |
// $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $ |
|
| 20 |
// $LastChangedRevision: 2378 $ |
|
| 21 |
// $LastChangedBy: sheiden $ |
|
| 22 |
// |
|
| 23 |
|
|
| 24 |
// STANDARD DECLARATIONS |
|
| 25 |
|
|
| 26 |
import org.kohsuke.args4j.* |
|
| 27 |
|
|
| 28 |
import groovy.transform.Field |
|
| 29 |
|
|
| 30 |
import java.nio.charset.Charset |
|
| 31 |
|
|
| 32 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
| 33 |
import org.txm.utils.* |
|
| 34 |
|
|
| 35 |
import javax.xml.stream.* |
|
| 36 |
|
|
| 37 |
import java.net.URL |
|
| 38 |
import java.util.regex.Matcher; |
|
| 39 |
import java.util.regex.Pattern; |
|
| 40 |
|
|
| 41 |
// PARAMETERS |
|
| 42 |
|
|
| 43 |
@Field @Option(name="inputDirectory", usage="TXT directory", widget="Folder", required=false, def="") |
|
| 44 |
File inputDirectory; |
|
| 45 |
@Field @Option(name="regexp", usage="Regular expression to match", widget="String", required=false, def="") |
|
| 46 |
String regexp; |
|
| 47 |
@Field @Option(name="encoding", usage="File encoding", widget="String", required=false, def="UTF-8") |
|
| 48 |
String encoding; |
|
| 49 |
|
|
| 50 |
// PARAMETERS |
|
| 51 |
|
|
| 52 |
if (inputDirectory == null) {
|
|
| 53 |
if (!ParametersDialog.open(this)) return; |
|
| 54 |
} |
|
| 55 |
|
|
| 56 |
// SANITY CHECK |
|
| 57 |
|
|
| 58 |
if (inputDirectory==null || regexp==null) { println "** FindMultiLineRegExpMacro: the input directory and the regular expression must be specified."; return}
|
|
| 59 |
|
|
| 60 |
if (!inputDirectory.exists()) { println "** FindMultiLineRegExpMacro: impossible to access the '$inputDirectory' input directory."; return}
|
|
| 61 |
|
|
| 62 |
// MAIN BODY |
|
| 63 |
|
|
| 64 |
def p = /$regexp/ |
|
| 65 |
|
|
| 66 |
println "-- looking for: $regexp" |
|
| 67 |
|
|
| 68 |
// Scan every regular file of the input directory and print all the matches of the
// regular expression found in its whole content.
// listFiles() returns null when the path is not a directory or cannot be read.
def inputFiles = inputDirectory.listFiles()
if (inputFiles == null) {
	println "** FindMultiLineRegExpMacro: impossible to list the files of the '$inputDirectory' input directory."
	return
}

for (File inputfile : inputFiles) {

	if (inputfile.isDirectory()) continue // ignore

	// decode the file with the 'encoding' parameter (platform default when it is not set)
	def content = encoding ? inputfile.getText(encoding) : inputfile.getText()
	def matcher = content =~ p
	def nMatches = matcher.size()

	println "\n-- file:"+inputfile.getName() + " "+nMatches + " match" +((nMatches > 1)?"s":"")

	if (nMatches > 0) {
		matcher.each { println it }
	}
}
|
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/txt/FindRegExpMacro.groovy (revision 3950) | ||
|---|---|---|
| 1 |
package org.txm.macro |
|
| 2 |
|
|
| 3 |
import org.kohsuke.args4j.* |
|
| 4 |
|
|
| 5 |
import groovy.transform.Field |
|
| 6 |
|
|
| 7 |
import java.nio.charset.Charset |
|
| 8 |
|
|
| 9 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
| 10 |
import org.txm.utils.* |
|
| 11 |
|
|
| 12 |
import javax.xml.stream.* |
|
| 13 |
|
|
| 14 |
import java.net.URL |
|
| 15 |
import java.util.regex.Matcher; |
|
| 16 |
import java.util.regex.Pattern; |
|
| 17 |
|
|
| 18 |
@Field @Option(name="inputDirectory",usage="TXT directory", widget="Folder", required=false, def="directory path") |
|
| 19 |
File inputDirectory; |
|
| 20 |
@Field @Option(name="regexp",usage="Regular expression to match", widget="String", required=false, def="regular expression") |
|
| 21 |
String regexp; |
|
| 22 |
@Field @Option(name="encoding",usage="File encoding", widget="String", required=false, def="UTF-8") |
|
| 23 |
String encoding; |
|
| 24 |
|
|
| 25 |
if (!ParametersDialog.open(this)) return; |
|
| 26 |
|
|
| 27 |
def p = /$regexp/ |
|
| 28 |
|
|
| 29 |
println "processing: "+inputDirectory.listFiles() |
|
| 30 |
// For every regular file of the input directory, collect the lines in which the
// regular expression is found, then print the match count followed by these lines.
for (File inputfile : inputDirectory.listFiles()) {
	if (inputfile.isDirectory()) continue // ignore

	def lines = []
	// read the file with the 'encoding' parameter (UTF-8 when it is left empty)
	inputfile.eachLine(encoding ?: "UTF-8") { line, n ->
		if ((line =~ p).find()) {
			lines << " line $n: $line"
		}
	}

	println inputfile.getName() + " "+lines.size() + " match" +((lines.size() > 1)?"s":"")
	lines.each { println it }
}
|
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/conversion/PennTreebank2TIGERMacro.groovy (revision 3950) | ||
|---|---|---|
| 1 |
// STANDARD DECLARATIONS |
|
| 2 |
package org.txm.macro |
|
| 3 |
|
|
| 4 |
import org.kohsuke.args4j.* |
|
| 5 |
import groovy.transform.Field |
|
| 6 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
| 7 |
|
|
| 8 |
import ims.tiger.importfilter.* |
|
| 9 |
import ims.tiger.importfilter.bracketing.* |
|
| 10 |
import org.apache.log4j.* |
|
| 11 |
|
|
| 12 |
// BEGINNING OF PARAMETERS |
|
| 13 |
|
|
| 14 |
// Declare each parameter here |
|
| 15 |
// (available widget types: Query, File, Folder, String, Text, Boolean, Integer, Float and Date) |
|
| 16 |
|
|
| 17 |
@Field @Option(name="SourceFile", usage="SourceFilename", widget="File", required=false, def="input.mrg") |
|
| 18 |
def SourceFile |
|
| 19 |
|
|
| 20 |
@Field @Option(name="TargetFile", usage="TargetFilename", widget="File", required=false, def="output.xml") |
|
| 21 |
def TargetFile |
|
| 22 |
|
|
| 23 |
@Field @Option(name="XMLTargetID", usage="XMLTargetID", widget="String", required=false, def="XMLTargetID") |
|
| 24 |
def XMLTargetID |
|
| 25 |
|
|
| 26 |
@Field @Option(name="SourceFolder", usage="SourceFolder (leave empty if SourceFile set)", widget="Folder", required=false, def="input <remove if SourceFile set>") |
|
| 27 |
def SourceFolder |
|
| 28 |
|
|
| 29 |
@Field @Option(name="TargetFolder", usage="TargetFolder (leave empty if TargetFile set)", widget="Folder", required=false, def="output") |
|
| 30 |
def TargetFolder |
|
| 31 |
|
|
| 32 |
@Field @Option(name="MaximumNumberOfSentences", usage="MaximumNumberOfSentences", widget="Integer", required=true, def="0") |
|
| 33 |
def MaximumNumberOfSentences |
|
| 34 |
|
|
| 35 |
// Open the parameters input dialog box |
|
| 36 |
if (!ParametersDialog.open(this)) return |
|
| 37 |
|
|
| 38 |
// END OF PARAMETERS |
|
| 39 |
|
|
| 40 |
BasicConfigurator.configure() |
|
| 41 |
|
|
| 42 |
ImportFilter filter = new MyUPennFilter() |
|
| 43 |
ImportFilterHandler handler = new SilentImportFilterHandler() |
|
| 44 |
filter.setImportFilterHandler(handler) |
|
| 45 |
|
|
| 46 |
filter.setCompression(false) |
|
| 47 |
filter.setSchemaFilename(System.getProperty("user.home")+"/TXM/xml/xsd/tiger/TigerXML.xsd")
|
|
| 48 |
filter.setMaximumNumberOfSentences(MaximumNumberOfSentences-1) |
|
| 49 |
|
|
| 50 |
if (SourceFolder == null) {
|
|
| 51 |
|
|
| 52 |
def SourceFilename = SourceFile.getAbsolutePath() |
|
| 53 |
def TargetFilename = TargetFile.getAbsolutePath() |
|
| 54 |
|
|
| 55 |
filter.setSourceFilename(SourceFilename) |
|
| 56 |
filter.setXMLTargetFilename(TargetFilename) |
|
| 57 |
filter.setXMLTargetID(XMLTargetID) |
|
| 58 |
println "Converting '$SourceFilename' to '$TargetFilename'" |
|
| 59 |
|
|
| 60 |
try {
|
|
| 61 |
filter.startConversion() |
|
| 62 |
} catch (Exception e) |
|
| 63 |
{ e.printStackTrace()
|
|
| 64 |
} |
|
| 65 |
} else {
|
|
| 66 |
|
|
| 67 |
// Folder mode: convert each regular file of SourceFolder to '<base name>.xml' in TargetFolder,
// the base name (file name without its last extension) also being used as XML target ID.
for (File inputfile : SourceFolder.listFiles()) {
	if (inputfile.isDirectory()) continue // ignore

	fileNameE = inputfile.getName()
	i = fileNameE.lastIndexOf('.')
	extension = (i > 0) ? fileNameE.substring(i+1) : ""
	// keep the whole name when there is no extension to strip (no dot, or leading dot only):
	// substring(0, i) would throw for i == -1 and give an empty name for i == 0
	fileName = (i > 0) ? fileNameE.substring(0, i) : fileNameE
	fileId = fileName
	input = inputfile.getAbsolutePath()
	output = TargetFolder.getAbsolutePath()+"/"+fileName+".xml"
	println "Converting '"+input+"' to '"+output+"'"
	filter.setSourceFilename(input)
	filter.setXMLTargetFilename(output)
	filter.setXMLTargetID(fileId)

	// a failure on one file is reported and the next files are still converted
	try {
		filter.startConversion()
	} catch (Exception e) {
		e.printStackTrace()
	}
}
|
| 88 |
} |
|
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/conversion/PennTreebank2TIGERMacro.readme (revision 3950) | ||
|---|---|---|
| 1 |
PennTreebank2TIGER macro |
|
| 2 |
======================== |
|
| 3 |
2016-07, sheiden |
|
| 4 |
|
|
| 5 |
* Install |
|
| 6 |
|
|
| 7 |
- unzip PennTreebank2TIGER.zip |
|
| 8 |
- copy TigerXML.xsd in $HOME/TXM/xml/xsd/tiger/TigerXML.xsd |
|
| 9 |
- copy PennTreebank2TIGERMacro.groovy in $HOME/TXM/scripts/macro/org/txm/macro |
|
| 10 |
- refresh macro view |
|
| 11 |
|
|
| 12 |
* Parameters |
|
| 13 |
|
|
| 14 |
- SourceFile: drives one file conversion process mode |
|
| 15 |
- SourceFolder: drives several files in a folder conversion process mode |
|
| 16 |
-> fill only the field to be used |
|
| 17 |
- TargetFile: if SourceFile then use that target file |
|
| 18 |
- XMLTargetID: if SourceFile then use that target ID |
|
| 19 |
- TargetFolder: if SourceFolder then use that target folder to save targets |
|
| 20 |
- target files are named from the source filename with the extension stripped and '.xml' added |
|
| 21 |
- target IDs are named from the source filename with the extension stripped |
|
| 22 |
- MaximumNumberOfSentences: maximum number of sentences to convert per file (set to '0' to convert all sentences) |
|
| 23 |
|
|
| TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/conversion/TigerXML.xsd (revision 3950) | ||
|---|---|---|
| 1 |
<?xml version="1.0"?> |
|
| 2 |
|
|
| 3 |
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"> |
|
| 4 |
|
|
| 5 |
<!-- ================================================================== |
|
| 6 |
XML Schema for the TIGER-XML format |
|
| 7 |
http://www.ims.uni-stuttgart.de/projekte/TIGER/public/TigerXML.xsd |
|
| 8 |
================================================================== |
|
| 9 |
TIGER Project, Wolfgang Lezius |
|
| 10 |
IMS, University of Stuttgart, 04/01/2003 |
|
| 11 |
================================================================== --> |
|
| 12 |
|
|
| 13 |
|
|
| 14 |
<!-- ====================================================== |
|
| 15 |
INCLUDES DECLARATION OF THE HEADER |
|
| 16 |
====================================================== --> |
|
| 17 |
<xsd:include schemaLocation="TigerXMLHeader.xsd" /> |
|
| 18 |
|
|
| 19 |
|
|
| 20 |
<!-- ====================================================== |
|
| 21 |
INCLUDES DECLARATION OF SUBCORPORA AND SENTENCES |
|
| 22 |
====================================================== --> |
|
| 23 |
<xsd:include schemaLocation="TigerXMLSubcorpus.xsd" /> |
|
| 24 |
|
|
| 25 |
|
|
| 26 |
<!-- ====================================================== |
|
| 27 |
DECLARATION OF THE CORPUS DOCUMENT |
|
| 28 |
====================================================== --> |
|
| 29 |
|
|
| 30 |
<!-- declaration of the root element: corpus --> |
|
| 31 |
|
|
| 32 |
<xsd:element name="corpus"> |
|
| 33 |
|
|
| 34 |
<xsd:complexType> |
|
| 35 |
|
|
| 36 |
<xsd:sequence> |
|
| 37 |
|
|
| 38 |
<xsd:choice> |
|
| 39 |
<!-- header of the document is optional --> |
|
| 40 |
<xsd:element name="head" type="headType" minOccurs="0" maxOccurs="1"/> |
|
| 41 |
</xsd:choice> |
|
| 42 |
|
|
| 43 |
<xsd:element name="body" type="bodyType" minOccurs="1" maxOccurs="1"/> |
|
| 44 |
|
|
| 45 |
</xsd:sequence> |
|
| 46 |
|
|
| 47 |
<!-- corpus ID --> |
|
| 48 |
<xsd:attribute name="id" type="idType" use="required" /> |
|
| 49 |
|
|
| 50 |
<!-- optional attribute: TigerXML version; used by TIGERSearch only --> |
|
| 51 |
<xsd:attribute name="version" type="xsd:string" use="optional" /> |
|
| 52 |
|
|
| 53 |
</xsd:complexType> |
|
| 54 |
|
|
| 55 |
</xsd:element> |
|
| 56 |
|
|
| 57 |
|
|
| 58 |
<!-- declaration of the body type --> |
|
| 59 |
|
|
| 60 |
<xsd:complexType name="bodyType"> |
|
| 61 |
|
|
| 62 |
<xsd:choice minOccurs="1" maxOccurs="unbounded"> |
|
| 63 |
<xsd:element name="subcorpus" type="subcorpusType" minOccurs="1" maxOccurs="1"/> |
|
| 64 |
<xsd:element name="s" type="sentenceType" minOccurs="1" maxOccurs="1"/> |
|
| 65 |
</xsd:choice> |
|
| 66 |
|
|
| 67 |
</xsd:complexType> |
|
| 68 |
|
|
| 69 |
|
|
| 70 |
</xsd:schema> |
|
| 0 | 71 | |
Formats disponibles : Unified diff