Révision 3950
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/csv/CSV2XMLTweetAwareMacro.groovy (revision 3950) | ||
---|---|---|
1 |
// Copyright © 2015 - ENS de Lyon - http://textometrie.ens-lyon.fr |
|
2 |
// @author sheiden |
|
3 |
// |
|
4 |
// This file is part of the TXM platform. |
|
5 |
// |
|
6 |
// The TXM platform is free software: you can redistribute it and/or modify |
|
7 |
// it under the terms of the GNU General Public License as published by |
|
8 |
// the Free Software Foundation, either version 3 of the License, or |
|
9 |
// (at your option) any later version. |
|
10 |
// |
|
11 |
// The TXM platform is distributed in the hope that it will be useful, |
|
12 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
14 |
// GNU General Public License for more details. |
|
15 |
// |
|
16 |
// You should have received a copy of the GNU General Public License |
|
17 |
// along with the TXM platform. If not, see <http://www.gnu.org/licenses/>. |
|
18 |
// |
|
19 |
// $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $ |
|
20 |
// $LastChangedRevision: 2378 $ |
|
21 |
// $LastChangedBy: sheiden $ |
|
22 |
// |
|
23 |
|
|
24 |
package org.txm.macro.csv |
|
25 |
|
|
26 |
import org.kohsuke.args4j.* |
|
27 |
import groovy.transform.Field |
|
28 |
import java.nio.charset.Charset |
|
29 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
30 |
import org.txm.utils.* |
|
31 |
import org.txm.importer.* |
|
32 |
import javax.xml.stream.* |
|
33 |
import java.net.URL |
|
34 |
//import org.apache.xerces.util |
|
35 |
|
|
36 |
@Field @Option(name="inputFile",usage="CSV File", widget="File", required=false, def="/home/sheiden/Corpus/src/charlie/csv/fr_07jan.csv") |
|
37 |
File inputFile; |
|
38 |
|
|
39 |
@Field @Option(name="columnSeparator",usage="column columnSeparator", widget="String", required=false, def=",") |
|
40 |
def columnSeparator; |
|
41 |
|
|
42 |
@Field @Option(name="characterEncoding",usage="File characterEncoding", widget="String", required=false, def="UTF-8") |
|
43 |
def characterEncoding; |
|
44 |
|
|
45 |
@Field @Option(name="rootTag",usage="Root tag name", widget="String", required=false, def="tweets") |
|
46 |
def rootTag; |
|
47 |
|
|
48 |
@Field @Option(name="textTag",usage="Text tag name", widget="String", required=false, def="tweet") |
|
49 |
def textTag; |
|
50 |
|
|
51 |
@Field @Option(name="metadataColumnList",usage="metadataColumnList column list separated with comma", widget="String", required=false, def="tweet-id-str,job-id,created-at,from-user,from-user-id-str,from-user-name,from-user-fullname,from-user-followers,from-user-following,from-user-favorites,from-user-tweets,from-user-timezone,to-user,to-user-id-str,to-user-name,source,location-geo,location-geo-0,location-geo-1,iso-language,analysis-state") |
|
52 |
def metadataColumnList; |
|
53 |
|
|
54 |
@Field @Option(name="textColumnList",usage="textColumnList column list separated with comma", widget="String", required=false, def="text") |
|
55 |
def textColumnList; |
|
56 |
|
|
57 |
if (!ParametersDialog.open(this)) return; |
|
58 |
|
|
59 |
metadataColumnList = metadataColumnList.split(",") |
|
60 |
textColumnList = textColumnList.split(",") |
|
61 |
textTag = textTag.trim() |
|
62 |
rootTag = rootTag.trim() |
|
63 |
|
|
64 |
CsvReader reader = new CsvReader(inputFile.getAbsolutePath(), columnSeparator.charAt(0), Charset.forName(characterEncoding)); |
|
65 |
if (!reader.readHeaders()) { |
|
66 |
println "Error: no header" |
|
67 |
return |
|
68 |
} |
|
69 |
|
|
70 |
def headers = Arrays.asList(reader.getHeaders()) |
|
71 |
headers.eachWithIndex {str, index -> headers[index] = str.replaceAll(/_/, "-") } |
|
72 |
reader.setHeaders(headers as String[]) |
|
73 |
|
|
74 |
println "Root tag: $rootTag" |
|
75 |
println "Text tag: $textTag" |
|
76 |
println "Metadata column(s): $metadataColumnList" |
|
77 |
println "Text column(s): $textColumnList" |
|
78 |
println "Processing..." |
|
79 |
|
|
80 |
def ok = true |
|
81 |
for (String m : metadataColumnList) { |
|
82 |
m = m.trim() |
|
83 |
if (!headers.contains(m)) { |
|
84 |
println "Error: missing metadata column named '$m'" |
|
85 |
ok = false |
|
86 |
} |
|
87 |
} |
|
88 |
for (String t : textColumnList) { |
|
89 |
t = t.trim() |
|
90 |
if (!headers.contains(t)) { |
|
91 |
println "Error: missing text column named '$t'" |
|
92 |
ok = false |
|
93 |
} |
|
94 |
} |
|
95 |
|
|
96 |
if (!ok) { |
|
97 |
println "** CSV2XML: Column(s) missing, aborting..." |
|
98 |
return |
|
99 |
} |
|
100 |
|
|
101 |
String name = inputFile.getName() |
|
102 |
int idx = name.indexOf(".") |
|
103 |
if (idx > 0) name = name.substring(0, idx) |
|
104 |
File outputfile = new File(inputFile.getParentFile(), name+".xml") |
|
105 |
|
|
106 |
XMLOutputFactory factory = XMLOutputFactory.newInstance() |
|
107 |
Writer output = new OutputStreamWriter(new FileOutputStream(outputfile) , "UTF-8") |
|
108 |
XMLStreamWriter writer = factory.createXMLStreamWriter(output) |
|
109 |
|
|
110 |
writer.writeStartDocument("UTF-8", "1.0") |
|
111 |
writer.writeCharacters("\n") |
|
112 |
writer.writeStartElement(rootTag) |
|
113 |
writer.writeCharacters("\n") |
|
114 |
|
|
115 |
int pb_n = 1 |
|
116 |
def tweet = [:] |
|
117 |
|
|
118 |
/**
 * Tells whether one Unicode code point can be written safely to an XML document.
 *
 * The rule follows the XML 1.1 character classes
 * (http://en.wikipedia.org/wiki/Valid_characters_in_XML):
 * the code point must be in the Char range, must not be a restricted control
 * character and must not be one of the "allowed but discouraged" code points.
 *
 * @param cp the Unicode code point to test
 * @return true when the code point can be kept
 */
def isValidXmlCodePoint = { int cp ->
    // valid ranges: U+0001-U+D7FF, U+E000-U+FFFD, U+10000-U+10FFFF
    // (this also rejects U+0000, unpaired surrogates, U+FFFE and U+FFFF)
    boolean inValidRange = (cp >= 0x1 && cp <= 0xD7FF) ||
            (cp >= 0xE000 && cp <= 0xFFFD) ||
            (cp >= 0x10000 && cp <= 0x10FFFF)
    if (!inValidRange) {
        return false
    }

    // restricted control characters (tab, line feed, carriage return and U+0085 stay allowed)
    boolean isControl = (cp >= 0x1 && cp <= 0x8) ||
            cp == 0xB || cp == 0xC ||
            (cp >= 0xE && cp <= 0x1F) ||
            (cp >= 0x7F && cp <= 0x84) ||
            (cp >= 0x86 && cp <= 0x9F)
    if (isControl) {
        return false
    }

    // "Characters allowed but discouraged": U+FDD0-U+FDEF and the last two
    // code points (xFFFE, xFFFF) of each supplementary plane
    if (cp >= 0xFDD0 && cp <= 0xFDEF) {
        return false
    }
    if (cp >= 0x10000 && (cp & 0xFFFE) == 0xFFFE) {
        return false
    }

    return true
}

/**
 * Tells whether every character of a value is a valid XML character.
 *
 * @param s a Character, a String or any object whose toString() gives the text to test
 * @return true when all the code points of the text are valid (also for null or empty text)
 */
def isAllValidXmlChars = { s ->
    if (s == null) {
        return true
    }
    String text = s.toString()
    // walk code points, not UTF-16 chars, so that surrogate pairs are tested as one character
    for (int i = 0; i < text.length(); ) {
        int cp = text.codePointAt(i)
        if (!isValidXmlCodePoint(cp)) {
            return false
        }
        i += Character.charCount(cp)
    }
    return true
}

/**
 * Returns a copy of the input text without the characters that are not valid in XML.
 *
 * @param input the text to clean
 * @return the text with the invalid code points removed
 */
def stripInvalidXmlCharacters = { input ->
    StringBuilder sb = new StringBuilder()
    for (int i = 0; i < input.length(); ) {
        int cp = input.codePointAt(i)
        if (isValidXmlCodePoint(cp)) {
            sb.appendCodePoint(cp)
        }
        i += Character.charCount(cp)
    }
    return sb.toString()
}
|
150 |
|
|
151 |
|
|
152 |
// Recursive tokenizer writing a text fragment to the XML stream.
// 'matchers' is a flat list of rules, four items per rule:
// [pattern, element name, attribute name, attribute value, ...].
// Each match of the first rule is written as an element; the text between
// the matches is handed to the remaining rules. A match of a rule whose
// attribute is type="url" is additionally wrapped in an <a href="..."> element.
// When no rule is left the fragment is written as plain characters.
def tokenizeAs
tokenizeAs = { source, matchers ->
    if (matchers.isEmpty()) {
        writer.writeCharacters(source)
        return
    }

    def pattern = matchers[0]
    def elementName = matchers[1]
    def attrName = matchers[2]
    def attrValue = matchers[3]
    def remainingRules = matchers.drop(4)
    def wrapInLink = (attrName == "type" && attrValue == "url")

    def found = (source =~ pattern)
    def cursor = 0
    while (found.find()) {
        def token = found.group()

        // text located before the match is processed by the following rules
        tokenizeAs(source.substring(cursor, found.start()), remainingRules)

        if (wrapInLink) {
            writer.writeStartElement("a")
            writer.writeAttribute("href", token)
        }
        writer.writeStartElement(elementName)
        writer.writeAttribute(attrName, attrValue)
        writer.writeAttribute("frpos", "NAM")
        writer.writeAttribute("frlemma", token)
        writer.writeCharacters(token)
        writer.writeEndElement()
        if (wrapInLink) {
            writer.writeEndElement()
        }

        cursor = found.end()
    }

    // text located after the last match
    tokenizeAs(source.substring(cursor), remainingRules)
}
|
185 |
|
|
186 |
while (reader.readRecord()) { |
|
187 |
|
|
188 |
writer.writeCharacters(" ") |
|
189 |
writer.writeEmptyElement("pb") // <pb/> |
|
190 |
writer.writeAttribute("n", ""+pb_n++) |
|
191 |
writer.writeCharacters("\n") |
|
192 |
|
|
193 |
writer.writeCharacters(" ") |
|
194 |
writer.writeStartElement(textTag) |
|
195 |
|
|
196 |
for (String m : metadataColumnList) { |
|
197 |
m = m.trim() |
|
198 |
writer.writeAttribute(m, CleanFile.clean(reader.get(m))) |
|
199 |
} |
|
200 |
|
|
201 |
writer.writeCharacters("\n") |
|
202 |
|
|
203 |
for (String t : textColumnList) { |
|
204 |
t = t.trim() |
|
205 |
|
|
206 |
writer.writeCharacters(" ") |
|
207 |
writer.writeStartElement(t) |
|
208 |
|
|
209 |
// look for re-tweets |
|
210 |
def c = reader.get(t) |
|
211 |
def retweet = false |
|
212 |
def rt_pattern = ~/(?s:^RT @[^:]+: (.*)$)/ |
|
213 |
def rt_m = (c =~ rt_pattern) |
|
214 |
if (rt_m) retweet = true |
|
215 |
if (retweet) { // it is a re-tweet |
|
216 |
// take off and count all re-tweet prefixes |
|
217 |
def nPrefix = 0 |
|
218 |
while (retweet) { |
|
219 |
c = rt_m.group(1) |
|
220 |
rt_m = (c =~ rt_pattern) |
|
221 |
if (rt_m) { retweet = true } else { retweet = false } |
|
222 |
nPrefix++ |
|
223 |
} |
|
224 |
def crt = sprintf("RT-%d ", nPrefix)+c |
|
225 |
def n = tweet.get(crt) |
|
226 |
if (n) { |
|
227 |
tweet.put(crt, n+1) |
|
228 |
} else { |
|
229 |
tweet.put(crt, 1) |
|
230 |
} |
|
231 |
} else { |
|
232 |
tweet.put(c, 1) |
|
233 |
tokenizeAs(CleanFile.clean(c),\ |
|
234 |
[/@\p{L}+/, "w", "type", "attag",\ |
|
235 |
/#\p{L}+/, "w", "type", "hashtag",\ |
|
236 |
/((http|ftp|https):\/\/[\p{L}\-_]+(\.[\p{L}\-_]+)+([\p{L}\-\.,@?^=%&:\/~\+#]*[\p{L}\-\@?^=%&\/~\+#])?)/, "w", "type", "url"]) |
|
237 |
} |
|
238 |
writer.writeEndElement() // t |
|
239 |
writer.writeCharacters("\n") |
|
240 |
} |
|
241 |
|
|
242 |
writer.writeCharacters(" ") |
|
243 |
writer.writeEndElement() // textTag |
|
244 |
writer.writeCharacters("\n") |
|
245 |
} |
|
246 |
|
|
247 |
writer.writeEndElement() // rootTag |
|
248 |
writer.writeCharacters("\n") |
|
249 |
writer.close() |
|
250 |
output.close() |
|
251 |
reader.close() |
|
252 |
|
|
253 |
// Final report: number of distinct [re-]tweets, then the most frequent ones.
def nt = tweet.size()
println sprintf("\n%d tweets uniques sur %d lus", nt, pb_n-1)

// the listing is limited to the 50 most frequent entries
if (nt > 50) {
    nt = 50
    println "Printing first 50 most frequent [re-]tweets:"
}

// sort by decreasing frequency, then really apply the limit announced above
// (the previous version announced 50 entries but printed the whole map)
tweet.sort { -it.value }.take(nt).each { entry ->
    def l = entry.key.length()
    if (l > 0) {
        // only the first 100 characters of the content are displayed
        println sprintf("[%d] %s...", entry.value, entry.key[0..(Math.min(l, 100)-1)])
    } else {
        println sprintf("Empty tweet content occurring %d times.", entry.value)
    }
}
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/txt/MultiLineSearchReplaceInDirectoryMacro.groovy (revision 3950) | ||
---|---|---|
1 |
package org.txm.macro.txt |
|
2 |
// Copyright © - ENS de Lyon - http://textometrie.ens-lyon.fr |
|
3 |
// @author sheiden |
|
4 |
// |
|
5 |
// This file is part of the TXM platform. |
|
6 |
// |
|
7 |
// The TXM platform is free software: you can redistribute it and/or modify |
|
8 |
// it under the terms of the GNU General Public License as published by |
|
9 |
// the Free Software Foundation, either version 3 of the License, or |
|
10 |
// (at your option) any later version. |
|
11 |
// |
|
12 |
// The TXM platform is distributed in the hope that it will be useful, |
|
13 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
15 |
// GNU General Public License for more details. |
|
16 |
// |
|
17 |
// You should have received a copy of the GNU General Public License |
|
18 |
// along with the TXM platform. If not, see <http://www.gnu.org/licenses/>. |
|
19 |
// |
|
20 |
// $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $ |
|
21 |
// $LastChangedRevision: 2378 $ |
|
22 |
// $LastChangedBy: sheiden $ |
|
23 |
// |
|
24 |
|
|
25 |
// STANDARD DECLARATIONS |
|
26 |
|
|
27 |
// imports |
|
28 |
import org.kohsuke.args4j.* |
|
29 |
import groovy.transform.Field |
|
30 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
31 |
|
|
32 |
// PARAMETERS |
|
33 |
|
|
34 |
// **change this parameter** |
|
35 |
@Field @Option(name="inputDirectory", usage="Dossier contenant les fichiers à modifier", widget="Folder", required=true, def='/home') |
|
36 |
inputDirectory = new File(System.getProperty("user.home"), "Bureau/voeux") |
|
37 |
|
|
38 |
// **change this parameter** |
|
39 |
@Field @Option(name="extension", usage="extension des fichiers à modifier (expression régulière)", widget="String", required=true, def='\\.txt') |
|
40 |
extension = "\\.txt" |
|
41 |
|
|
42 |
// **change this parameter** |
|
43 |
@Field @Option(name="find", usage="Expression régulière à chercher", widget="String", required=true, def='’') |
|
44 |
find = "’" |
|
45 |
|
|
46 |
// **change this parameter** |
|
47 |
@Field @Option(name="replaceWith", usage="Chaîne de remplacement", widget="String", required=false, def='\'') |
|
48 |
replaceWith = "'" |
|
49 |
|
|
50 |
// **change this parameter** |
|
51 |
@Field @Option(name="encoding", usage="Encodage des caractères des fichiers", widget="String", required=true, def='UTF-8') |
|
52 |
encoding = "utf-8" |
|
53 |
|
|
54 |
// PARAMETERS DIALOG |
|
55 |
|
|
56 |
if (!ParametersDialog.open(this)) return; |
|
57 |
|
|
58 |
// SANITY CHECK |
|
59 |
|
|
60 |
if (inputDirectory==null || find==null || replaceWith==null) { println "** MultiLineSearchReplaceInDirectoryMacro: the input directory, the regular expression to search and replacement must be specified."; return} |
|
61 |
|
|
62 |
if (!inputDirectory.exists()) { println "** MultiLineSearchReplaceInDirectoryMacro: impossible to access the '$inputDirectory' input directory."; return} |
|
63 |
|
|
64 |
// MAIN BODY |
|
65 |
|
|
66 |
println "-- working in $inputDirectory directory with files of '$extension' extension" |
|
67 |
println "-- replacing '$find' with '$replaceWith'" |
|
68 |
|
|
69 |
// find = /date="([0-9]+)-([0-9]+-[0-9]+)"/ |
|
70 |
// **change this parameter** |
|
71 |
// replaceWith = 'date="$1-$2" year="$1"' |
|
72 |
// **change this parameter** (warning: '$1', '$2'... can be interpreted by Groovy in "..." strings) |
|
73 |
// |
|
74 |
// RECETTE |
|
75 |
// <tei:pb [^>]+>\n <tei:p>[0-9]+</tei:p>\n <tei:p>[^ ]+ 18[5-6][0-9]\.</tei:p>\n |
|
76 |
|
|
77 |
def p = /$find/ |
|
78 |
|
|
79 |
// For each file of the input directory whose name matches the extension:
// write the replaced content to a temporary file located in the same directory,
// then move the temporary file over the original.
inputDirectory.eachFileMatch(~/.*$extension/) { file ->
    // eachFileMatch also yields directories: only regular files can be rewritten
    if (!file.isFile()) return

    println "\n-- processing: "+file.getName()
    def tmp = File.createTempFile("SearchReplaceInDirectoryTemp", ".tmp", file.getParentFile()) // create temporary file
    try {
        tmp.withWriter(encoding) { writer ->
            writer.print(file.getText(encoding).replaceAll(p, replaceWith))
        }
        // replace the original in one step: the previous delete() + renameTo() sequence
        // ignored both results, so a failed rename silently lost the original file
        java.nio.file.Files.move(tmp.toPath(), file.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING)
    } finally {
        // after a successful move the temporary file no longer exists;
        // after a failure it must not be left behind in the directory
        if (tmp.exists()) tmp.delete()
    }
}
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/txt/Taltac2XMLMacro.groovy (revision 3950) | ||
---|---|---|
1 |
package org.txm.macro.line |
|
2 |
// Copyright © 2018 - ENS de Lyon - http://textometrie.ens-lyon.fr |
|
3 |
// @author sheiden |
|
4 |
// |
|
5 |
// This file is part of the TXM platform. |
|
6 |
// |
|
7 |
// The TXM platform is free software: you can redistribute it and/or modify |
|
8 |
// it under the terms of the GNU General Public License as published by |
|
9 |
// the Free Software Foundation, either version 3 of the License, or |
|
10 |
// (at your option) any later version. |
|
11 |
// |
|
12 |
// The TXM platform is distributed in the hope that it will be useful, |
|
13 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
15 |
// GNU General Public License for more details. |
|
16 |
// |
|
17 |
// You should have received a copy of the GNU General Public License |
|
18 |
// along with the TXM platform. If not, see <http://www.gnu.org/licenses/>. |
|
19 |
// |
|
20 |
// $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $ |
|
21 |
// $LastChangedRevision: 2378 $ |
|
22 |
// $LastChangedBy: sheiden $ |
|
23 |
// |
|
24 |
|
|
25 |
// imports |
|
26 |
import org.kohsuke.args4j.* |
|
27 |
import groovy.transform.Field |
|
28 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
29 |
|
|
30 |
// parameters |
|
31 |
|
|
32 |
// **change this parameter** |
|
33 |
@Field @Option(name="inputFile",usage="fichier Taltac", widget="File", required=true, def='') |
|
34 |
def inputFile |
|
35 |
|
|
36 |
// **change this parameter** |
|
37 |
@Field @Option(name="characterEncoding",usage="Système d'encodage des caractères utilisé pour le fichier", widget="String", required=true, def='UTF-8') |
|
38 |
def characterEncoding = "UTF-8" |
|
39 |
|
|
40 |
@Field @Option(name="titleTag", usage="name of title ++++ tag", widget="String", required=false, def="titolo") |
|
41 |
def titleTag |
|
42 |
|
|
43 |
@Field @Option(name="addParagraphs", usage="encode each text line as a paragraph", widget="Boolean", required=true, def="true") |
|
44 |
def addParagraphs |
|
45 |
|
|
46 |
if (!ParametersDialog.open(this)) return |
|
47 |
|
|
48 |
name = inputFile.getName() |
|
49 |
idx = name.lastIndexOf(".") |
|
50 |
if (idx > 0) name = name.substring(0, idx) |
|
51 |
outputFile = new File(inputFile.getParentFile(), name+".xml") |
|
52 |
|
|
53 |
pageNum = 2 |
|
54 |
outputFile.write('') // create empty file |
|
55 |
|
|
56 |
// Converts the Taltac file to XML, line by line:
// - a "****ident *prop=value ..." line starts a new <doc> element carrying the properties as attributes
// - a "++++tag" line starts a new <tag> section (wrapped in <head> when tag is the title tag)
// - any other line is text, optionally wrapped in <p>
// The XML declaration announces UTF-8, so the writer must use that encoding too
// (withWriter() without argument uses the platform default encoding).
outputFile.withWriter("UTF-8") { writer ->

    writer.println '<?xml version="1.0" encoding="UTF-8"?>\n<taltac>'

    def firstDoc = true
    def inTitle = false
    def currentTag = ""
    def metadata = "<hi>no metadata</hi>"
    def metadataEdited = false
    def titleName = titleTag ?: "" // the parameter is optional

    // closes the currently open section element, if any, and its <head> wrapper when it is the title
    def closeCurrentTag = {
        if (currentTag.size() > 0) {
            writer.println sprintf("</%s>", currentTag)
            if (inTitle) {
                writer.println "</head>"
                inTitle = false
            }
            currentTag = ""
        }
    }

    inputFile.eachLine(characterEncoding) { line ->

        // escape the XML markup characters of the source text
        line = line.replaceAll("&", "&amp;")
        line = line.replaceAll("<", "&lt;")

        // ****yahoobanque1 *data=31gen *autore=da *rubrica=da *ora=08 *agenzia=reuters *grafici=da
        def docMatch = (line =~ /^\*\*\*\*([^ ]+) (.*)/)
        if (docMatch) {

            def ident = docMatch[0][1]
            def properties = docMatch[0][2]

            // the properties rendered as a list, written once after the first section of the document
            metadata = properties.replaceAll(/ ?\*([^=]+)=([^ ]+)/) { all2, propname, propvalue ->
                "<item><hi>$propname</hi>: $propvalue</item>\n"
            }
            metadata = "<list type=\"unordered\">\n"+metadata+"</list>"
            metadataEdited = false

            // the properties rendered as XML attributes of <doc>
            properties = properties.replaceAll(/\*([^=]+)=([^ ]+)/) { all2, propname, propvalue ->
                def safeValue = propvalue.replace('"', '&quot;') // a quote would end the attribute value
                "$propname=\"$safeValue\""
            }

            if (firstDoc) {
                firstDoc = false
                writer.println "<doc ident=\"$ident\" $properties>"
            } else {
                closeCurrentTag()
                writer.println "</doc>\n<pb n=\"${pageNum++}\"/>\n<doc ident=\"$ident\" $properties>"
            }
        } else {

            // ++++titolo
            def tagMatch = (line =~ /^\+\+\+\+(.+)/)
            if (tagMatch) {

                def tag = tagMatch[0][1]

                // the metadata list is written after the first closed section of a document
                def hadOpenTag = currentTag.size() > 0
                closeCurrentTag()
                if (hadOpenTag && !metadataEdited) {
                    writer.println metadata
                    metadataEdited = true
                }

                if (titleName.size() > 0 && tag == titleName) {
                    writer.println "<head>"
                    inTitle = true
                }
                writer.println sprintf("<%s>", tag)
                currentTag = tag

            } else {

                // normal line
                if (addParagraphs) {
                    writer.println "<p>$line</p>"
                } else {
                    writer.println "$line"
                }
            }
        }
    }

    // close only what is really open: the previous version always wrote "</currentTag>",
    // producing "</>" when no section was open and leaving an open <head> unclosed
    closeCurrentTag()
    if (!firstDoc) {
        writer.println "</doc>"
    }
    writer.println "</taltac>\n"
    // no explicit close(): withWriter closes the writer itself
}
|
172 |
|
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/txt/FindMultiLineRegExpMacro.groovy (revision 3950) | ||
---|---|---|
1 |
// Copyright © 2015 - ENS de Lyon - http://textometrie.ens-lyon.fr |
|
2 |
// @author sheiden |
|
3 |
// |
|
4 |
// This file is part of the TXM platform. |
|
5 |
// |
|
6 |
// The TXM platform is free software: you can redistribute it and/or modify |
|
7 |
// it under the terms of the GNU General Public License as published by |
|
8 |
// the Free Software Foundation, either version 3 of the License, or |
|
9 |
// (at your option) any later version. |
|
10 |
// |
|
11 |
// The TXM platform is distributed in the hope that it will be useful, |
|
12 |
// but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13 |
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
14 |
// GNU General Public License for more details. |
|
15 |
// |
|
16 |
// You should have received a copy of the GNU General Public License |
|
17 |
// along with the TXM platform. If not, see <http://www.gnu.org/licenses/>. |
|
18 |
// |
|
19 |
// $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $ |
|
20 |
// $LastChangedRevision: 2378 $ |
|
21 |
// $LastChangedBy: sheiden $ |
|
22 |
// |
|
23 |
|
|
24 |
// STANDARD DECLARATIONS |
|
25 |
|
|
26 |
import org.kohsuke.args4j.* |
|
27 |
|
|
28 |
import groovy.transform.Field |
|
29 |
|
|
30 |
import java.nio.charset.Charset |
|
31 |
|
|
32 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
33 |
import org.txm.utils.* |
|
34 |
|
|
35 |
import javax.xml.stream.* |
|
36 |
|
|
37 |
import java.net.URL |
|
38 |
import java.util.regex.Matcher; |
|
39 |
import java.util.regex.Pattern; |
|
40 |
|
|
41 |
// PARAMETERS |
|
42 |
|
|
43 |
@Field @Option(name="inputDirectory", usage="TXT directory", widget="Folder", required=false, def="") |
|
44 |
File inputDirectory; |
|
45 |
@Field @Option(name="regexp", usage="Regular expression to match", widget="String", required=false, def="") |
|
46 |
String regexp; |
|
47 |
@Field @Option(name="encoding", usage="File encoding", widget="String", required=false, def="UTF-8") |
|
48 |
String encoding; |
|
49 |
|
|
50 |
// PARAMETERS |
|
51 |
|
|
52 |
if (inputDirectory == null) { |
|
53 |
if (!ParametersDialog.open(this)) return; |
|
54 |
} |
|
55 |
|
|
56 |
// SANITY CHECK |
|
57 |
|
|
58 |
if (inputDirectory==null || regexp==null) { println "** FindMultiLineRegExpMacro: the input directory and the regular expression must be specified."; return} |
|
59 |
|
|
60 |
if (!inputDirectory.exists()) { println "** FindMultiLineRegExpMacro: impossible to access the '$inputDirectory' input directory."; return} |
|
61 |
|
|
62 |
// MAIN BODY |
|
63 |
|
|
64 |
def p = /$regexp/ |
|
65 |
|
|
66 |
println "-- looking for: $regexp" |
|
67 |
|
|
68 |
// Searches the regular expression in the whole content of each file of the input
// directory (so a match may span several lines) and prints the matches per file.
// The files are read with the 'encoding' parameter (UTF-8 when it is not set);
// the previous version ignored it and used the platform default encoding.
def charset = encoding ?: "UTF-8"

for (File inputfile : inputDirectory.listFiles()) {

    if (inputfile.isDirectory()) continue // ignore

    def matcher = inputfile.getText(charset) =~ p
    def count = matcher.size() // counted once: each call scans the whole text again

    println "\n-- file:"+inputfile.getName() + " "+count + " match" +((count > 1)?"s":"")

    if (count > 0) {
        matcher.each { println it }
    }
}
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/txt/FindRegExpMacro.groovy (revision 3950) | ||
---|---|---|
1 |
package org.txm.macro |
|
2 |
|
|
3 |
import org.kohsuke.args4j.* |
|
4 |
|
|
5 |
import groovy.transform.Field |
|
6 |
|
|
7 |
import java.nio.charset.Charset |
|
8 |
|
|
9 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
10 |
import org.txm.utils.* |
|
11 |
|
|
12 |
import javax.xml.stream.* |
|
13 |
|
|
14 |
import java.net.URL |
|
15 |
import java.util.regex.Matcher; |
|
16 |
import java.util.regex.Pattern; |
|
17 |
|
|
18 |
@Field @Option(name="inputDirectory",usage="TXT directory", widget="Folder", required=false, def="directory path") |
|
19 |
File inputDirectory; |
|
20 |
@Field @Option(name="regexp",usage="Regular expression to match", widget="String", required=false, def="regular expression") |
|
21 |
String regexp; |
|
22 |
@Field @Option(name="encoding",usage="File encoding", widget="String", required=false, def="UTF-8") |
|
23 |
String encoding; |
|
24 |
|
|
25 |
if (!ParametersDialog.open(this)) return; |
|
26 |
|
|
27 |
def p = /$regexp/ |
|
28 |
|
|
29 |
println "processing: "+inputDirectory.listFiles() |
|
30 |
// Searches the regular expression line by line in each file of the input directory
// and prints, per file, the number of matching lines followed by those lines.
// The files are read with the 'encoding' parameter (UTF-8 when it is not set);
// the previous version always read them as UTF-8 whatever the parameter value.
def charset = encoding ?: "UTF-8"

for (File inputfile : inputDirectory.listFiles()) {
    if (inputfile.isDirectory()) continue // ignore

    def lines = []
    inputfile.eachLine(charset) { line, n ->
        // one match is enough to report the line, no need to count all of them
        if ((line =~ p).find()) {
            lines << " line $n: $line"
        }
    }

    println inputfile.getName() + " "+lines.size() + " match" +((lines.size() > 1)?"s":"")
    if (lines.size() > 0) {
        for (String s : lines ) println s
    }
}
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/conversion/PennTreebank2TIGERMacro.groovy (revision 3950) | ||
---|---|---|
1 |
// STANDARD DECLARATIONS |
|
2 |
package org.txm.macro |
|
3 |
|
|
4 |
import org.kohsuke.args4j.* |
|
5 |
import groovy.transform.Field |
|
6 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
7 |
|
|
8 |
import ims.tiger.importfilter.* |
|
9 |
import ims.tiger.importfilter.bracketing.* |
|
10 |
import org.apache.log4j.* |
|
11 |
|
|
12 |
// BEGINNING OF PARAMETERS |
|
13 |
|
|
14 |
// Declare each parameter here |
|
15 |
// (available widget types: Query, File, Folder, String, Text, Boolean, Integer, Float and Date) |
|
16 |
|
|
17 |
@Field @Option(name="SourceFile", usage="SourceFilename", widget="File", required=false, def="input.mrg") |
|
18 |
def SourceFile |
|
19 |
|
|
20 |
@Field @Option(name="TargetFile", usage="TargetFilename", widget="File", required=false, def="output.xml") |
|
21 |
def TargetFile |
|
22 |
|
|
23 |
@Field @Option(name="XMLTargetID", usage="XMLTargetID", widget="String", required=false, def="XMLTargetID") |
|
24 |
def XMLTargetID |
|
25 |
|
|
26 |
@Field @Option(name="SourceFolder", usage="SourceFolder (leave empty if SourceFile set)", widget="Folder", required=false, def="input <remove if SourceFile set>") |
|
27 |
def SourceFolder |
|
28 |
|
|
29 |
@Field @Option(name="TargetFolder", usage="TargetFolder (leave empty if TargetFile set)", widget="Folder", required=false, def="output") |
|
30 |
def TargetFolder |
|
31 |
|
|
32 |
@Field @Option(name="MaximumNumberOfSentences", usage="MaximumNumberOfSentences", widget="Integer", required=true, def="0") |
|
33 |
def MaximumNumberOfSentences |
|
34 |
|
|
35 |
// Open the parameters input dialog box |
|
36 |
if (!ParametersDialog.open(this)) return |
|
37 |
|
|
38 |
// END OF PARAMETERS |
|
39 |
|
|
40 |
BasicConfigurator.configure() |
|
41 |
|
|
42 |
ImportFilter filter = new MyUPennFilter() |
|
43 |
ImportFilterHandler handler = new SilentImportFilterHandler() |
|
44 |
filter.setImportFilterHandler(handler) |
|
45 |
|
|
46 |
filter.setCompression(false) |
|
47 |
filter.setSchemaFilename(System.getProperty("user.home")+"/TXM/xml/xsd/tiger/TigerXML.xsd") |
|
48 |
filter.setMaximumNumberOfSentences(MaximumNumberOfSentences-1) |
|
49 |
|
|
50 |
// Runs one conversion with the current settings of the filter.
// A failure is reported but does not stop the macro, so that the other
// files of a folder are still converted.
def runConversion = {
    try {
        filter.startConversion()
    } catch (Exception e) {
        e.printStackTrace()
    }
}

if (SourceFolder == null) {

    // single file mode: SourceFile -> TargetFile, with the XMLTargetID parameter as ID
    def SourceFilename = SourceFile.getAbsolutePath()
    def TargetFilename = TargetFile.getAbsolutePath()

    filter.setSourceFilename(SourceFilename)
    filter.setXMLTargetFilename(TargetFilename)
    filter.setXMLTargetID(XMLTargetID)
    println "Converting '$SourceFilename' to '$TargetFilename'"

    runConversion()
} else {

    // folder mode: each file of SourceFolder is converted to TargetFolder/<name without extension>.xml
    for (File inputfile : SourceFolder.listFiles()) {
        if (inputfile.isDirectory()) continue // ignore

        def fileNameE = inputfile.getName()
        def i = fileNameE.lastIndexOf('.')
        // strip the extension only when there is one: substring(0, -1) threw
        // an exception for file names without a dot
        def fileName = (i > 0) ? fileNameE.substring(0, i) : fileNameE
        def fileId = fileName
        def input = inputfile.getAbsolutePath()
        def output = new File(TargetFolder, fileName+".xml").getAbsolutePath()
        println "Converting '"+input+"' to '"+output+"'"
        filter.setSourceFilename(input)
        filter.setXMLTargetFilename(output)
        filter.setXMLTargetID(fileId)

        runConversion()
    }
}
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/conversion/PennTreebank2TIGERMacro.readme (revision 3950) | ||
---|---|---|
1 |
PennTreebank2TIGER macro |
|
2 |
======================== |
|
3 |
2016-07, sheiden |
|
4 |
|
|
5 |
* Install |
|
6 |
|
|
7 |
- unzip PennTreebank2TIGER.zip |
|
8 |
- copy TigerXML.xsd in $HOME/TXM/xml/xsd/tiger/TigerXML.xsd |
|
9 |
- copy PennTreebank2TIGERMacro.groovy in $HOME/TXM/scripts/macro/org/txm/macro |
|
10 |
- refresh macro view |
|
11 |
|
|
12 |
* Parameters |
|
13 |
|
|
14 |
- SourceFile: drives one file conversion process mode |
|
15 |
- SourceFolder: drives several files in a folder conversion process mode |
|
16 |
-> fill only the field to be used |
|
17 |
- TargetFile: if SourceFile then use that target file |
|
18 |
- XMLTargetID: if SourceFile then use that target ID |
|
19 |
- TargetFolder: if SourceFolder then use that target folder to save targets |
|
20 |
- target files are named from the source filename with the extension stripped and '.xml' added |
|
21 |
- target ID are named from the source filename with the extension stripped |
|
22 |
- MaximumNumberOfSentences: maximum number of sentences to convert per file (set to '0' to convert all sentences) |
|
23 |
|
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/conversion/TigerXML.xsd (revision 3950) | ||
---|---|---|
1 |
<?xml version="1.0"?> |
|
2 |
|
|
3 |
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"> |
|
4 |
|
|
5 |
<!-- ================================================================== |
|
6 |
XML Schema for the TIGER-XML format |
|
7 |
http://www.ims.uni-stuttgart.de/projekte/TIGER/public/TigerXML.xsd |
|
8 |
================================================================== |
|
9 |
TIGER Project, Wolfgang Lezius |
|
10 |
IMS, University of Stuttgart, 04/01/2003 |
|
11 |
================================================================== --> |
|
12 |
|
|
13 |
|
|
14 |
<!-- ====================================================== |
|
15 |
INCLUDES DECLARATION OF THE HEADER |
|
16 |
====================================================== --> |
|
17 |
<xsd:include schemaLocation="TigerXMLHeader.xsd" /> |
|
18 |
|
|
19 |
|
|
20 |
<!-- ====================================================== |
|
21 |
INCLUDES DECLARATION OF SUBCORPORA AND SENTENCES |
|
22 |
====================================================== --> |
|
23 |
<xsd:include schemaLocation="TigerXMLSubcorpus.xsd" /> |
|
24 |
|
|
25 |
|
|
26 |
<!-- ====================================================== |
|
27 |
DECLARATION OF THE CORPUS DOCUMENT |
|
28 |
====================================================== --> |
|
29 |
|
|
30 |
<!-- declaration of the root element: corpus --> |
|
31 |
|
|
32 |
<xsd:element name="corpus"> |
|
33 |
|
|
34 |
<xsd:complexType> |
|
35 |
|
|
36 |
<xsd:sequence> |
|
37 |
|
|
38 |
<xsd:choice> |
|
39 |
<!-- header of the document is optional --> |
|
40 |
<xsd:element name="head" type="headType" minOccurs="0" maxOccurs="1"/> |
|
41 |
</xsd:choice> |
|
42 |
|
|
43 |
<xsd:element name="body" type="bodyType" minOccurs="1" maxOccurs="1"/> |
|
44 |
|
|
45 |
</xsd:sequence> |
|
46 |
|
|
47 |
<!-- corpus ID --> |
|
48 |
<xsd:attribute name="id" type="idType" use="required" /> |
|
49 |
|
|
50 |
<!-- optional attribute: TigerXML version; used by TIGERSearch only --> |
|
51 |
<xsd:attribute name="version" type="xsd:string" use="optional" /> |
|
52 |
|
|
53 |
</xsd:complexType> |
|
54 |
|
|
55 |
</xsd:element> |
|
56 |
|
|
57 |
|
|
58 |
<!-- declaration of the body type --> |
|
59 |
|
|
60 |
<xsd:complexType name="bodyType"> |
|
61 |
|
|
62 |
<xsd:choice minOccurs="1" maxOccurs="unbounded"> |
|
63 |
<xsd:element name="subcorpus" type="subcorpusType" minOccurs="1" maxOccurs="1"/> |
|
64 |
<xsd:element name="s" type="sentenceType" minOccurs="1" maxOccurs="1"/> |
|
65 |
</xsd:choice> |
|
66 |
|
|
67 |
</xsd:complexType> |
|
68 |
|
|
69 |
|
|
70 |
</xsd:schema> |
|
0 | 71 |
Formats disponibles : Unified diff