Révision 3950

TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/csv/CSV2XMLTweetAwareMacro.groovy (revision 3950)
1
// Copyright © 2015 - ENS de Lyon - http://textometrie.ens-lyon.fr
2
// @author sheiden
3
//
4
// This file is part of the TXM platform.
5
//
6
// The TXM platform is free software: you can redistribute it and/or modify
7
// it under the terms of the GNU General Public License as published by
8
// the Free Software Foundation, either version 3 of the License, or
9
// (at your option) any later version.
10
//
11
// The TXM platform is distributed in the hope that it will be useful,
12
// but WITHOUT ANY WARRANTY; without even the implied warranty of
13
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
// GNU General Public License for more details.
15
//
16
// You should have received a copy of the GNU General Public License
17
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
18
//
19
// $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $
20
// $LastChangedRevision: 2378 $
21
// $LastChangedBy: sheiden $
22
//
23

  
24
package org.txm.macro.csv
25

  
26
import org.kohsuke.args4j.*
27
import groovy.transform.Field
28
import java.nio.charset.Charset
29
import org.txm.rcpapplication.swt.widget.parameters.*
30
import org.txm.utils.*
31
import org.txm.importer.*
32
import javax.xml.stream.*
33
import java.net.URL
34
//import org.apache.xerces.util
35

  
36
@Field @Option(name="inputFile",usage="CSV File", widget="File", required=false, def="/home/sheiden/Corpus/src/charlie/csv/fr_07jan.csv")
37
File inputFile;
38

  
39
@Field @Option(name="columnSeparator",usage="column columnSeparator", widget="String", required=false, def=",")
40
def columnSeparator;
41

  
42
@Field @Option(name="characterEncoding",usage="File characterEncoding", widget="String", required=false, def="UTF-8")
43
def characterEncoding;
44

  
45
@Field @Option(name="rootTag",usage="Root tag name", widget="String", required=false, def="tweets")
46
def rootTag;
47

  
48
@Field @Option(name="textTag",usage="Text tag name", widget="String", required=false, def="tweet")
49
def textTag;
50

  
51
@Field @Option(name="metadataColumnList",usage="metadataColumnList column list separated with comma", widget="String", required=false, def="tweet-id-str,job-id,created-at,from-user,from-user-id-str,from-user-name,from-user-fullname,from-user-followers,from-user-following,from-user-favorites,from-user-tweets,from-user-timezone,to-user,to-user-id-str,to-user-name,source,location-geo,location-geo-0,location-geo-1,iso-language,analysis-state")
52
def metadataColumnList;
53

  
54
@Field @Option(name="textColumnList",usage="textColumnList column list separated with comma", widget="String", required=false, def="text")
55
def textColumnList;
56

  
57
if (!ParametersDialog.open(this)) return;
58

  
59
metadataColumnList = metadataColumnList.split(",")
60
textColumnList = textColumnList.split(",")
61
textTag = textTag.trim()
62
rootTag = rootTag.trim()
63

  
64
CsvReader reader = new CsvReader(inputFile.getAbsolutePath(), columnSeparator.charAt(0), Charset.forName(characterEncoding));
65
if (!reader.readHeaders()) {
66
	println "Error: no header"
67
	return
68
}
69

  
70
def headers = Arrays.asList(reader.getHeaders())
71
headers.eachWithIndex {str, index -> headers[index] = str.replaceAll(/_/, "-") }
72
reader.setHeaders(headers as String[])
73

  
74
println "Root tag: $rootTag"
75
println "Text tag: $textTag"
76
println "Metadata column(s): $metadataColumnList"
77
println "Text column(s): $textColumnList"
78
println "Processing..."
79

  
80
// Make sure every requested column is present in the CSV header before any
// output is produced. All missing columns are reported, metadata first.
def absentMetadata = metadataColumnList.collect { it.trim() }.findAll { !headers.contains(it) }
absentMetadata.each { m -> println "Error: missing metadata column named '$m'" }

def absentText = textColumnList.collect { it.trim() }.findAll { !headers.contains(it) }
absentText.each { t -> println "Error: missing text column named '$t'" }

def ok = absentMetadata.isEmpty() && absentText.isEmpty()
if (!ok) {
	println "** CSV2XML: Column(s) missing, aborting..."
	return
}
100

  
101
// Build the output file: same directory and base name as the input file,
// with the extension replaced by ".xml".
String name = inputFile.getName()
// lastIndexOf (not indexOf): only the final extension is stripped, so that
// "tweets.2015.csv" and "tweets.2016.csv" do not both end up as "tweets.xml".
int idx = name.lastIndexOf(".")
if (idx > 0) name = name.substring(0, idx)
File outputfile = new File(inputFile.getParentFile(), name+".xml")
105

  
106
XMLOutputFactory factory = XMLOutputFactory.newInstance()
107
Writer output = new OutputStreamWriter(new FileOutputStream(outputfile) , "UTF-8")
108
XMLStreamWriter writer = factory.createXMLStreamWriter(output)
109

  
110
writer.writeStartDocument("UTF-8", "1.0")
111
writer.writeCharacters("\n")
112
writer.writeStartElement(rootTag)
113
writer.writeCharacters("\n")
114

  
115
int pb_n = 1
116
def tweet = [:]
117

  
118
// Tells whether one Unicode code point can be written to an XML 1.1 document
// and is neither a restricted control character nor a "discouraged" one.
// Ranges are compared numerically rather than through regular expressions.
def isValidXmlCodePoint = { int cp ->
	// xml 1.1 spec http://en.wikipedia.org/wiki/Valid_characters_in_XML
	boolean inValidRange = (cp >= 0x1 && cp <= 0xD7FF) ||
			(cp >= 0xE000 && cp <= 0xFFFD) ||
			(cp >= 0x10000 && cp <= 0x10FFFF)
	if (!inValidRange) {
		// NUL, unpaired surrogate, U+FFFE or U+FFFF
		return false
	}

	boolean restrictedControl = (cp >= 0x1 && cp <= 0x8) ||
			(cp >= 0xB && cp <= 0xC) ||
			(cp >= 0xE && cp <= 0x1F) ||
			(cp >= 0x7F && cp <= 0x84) ||
			(cp >= 0x86 && cp <= 0x9F)
	if (restrictedControl) {
		// a control character
		return false
	}

	// "Characters allowed but discouraged": U+FDD0..U+FDEF and the last two
	// code points (xFFFE, xFFFF) of each supplementary plane
	boolean discouraged = (cp >= 0xFDD0 && cp <= 0xFDEF) ||
			(cp >= 0x10000 && (cp & 0xFFFE) == 0xFFFE)
	return !discouraged
}

// Returns true when every code point of s passes isValidXmlCodePoint.
// s may be a String, a char or any object (its toString() is inspected);
// null and the empty string are considered valid.
def isAllValidXmlChars = { s ->
	if (s == null) return true
	String text = s.toString()
	int i = 0
	while (i < text.length()) {
		// walk by code point so that surrogate pairs are judged as one character
		int cp = text.codePointAt(i)
		if (!isValidXmlCodePoint(cp)) return false
		i += Character.charCount(cp)
	}
	return true
}

// Returns a copy of input without the code points rejected by
// isValidXmlCodePoint. A null input gives an empty string.
def stripInvalidXmlCharacters = { input ->
	if (input == null) return ""
	String text = input.toString()
	StringBuilder sb = new StringBuilder(text.length())
	int i = 0
	while (i < text.length()) {
		int cp = text.codePointAt(i)
		if (isValidXmlCodePoint(cp)) {
			sb.appendCodePoint(cp)
		}
		i += Character.charCount(cp)
	}
	return sb.toString()
}
150

  
151

  
152
// Recursively writes 'source' to the XML stream writer, turning the parts
// matched by tokenization rules into elements.
//
// source   : the text fragment to write
// matchers : flat list of rules, four entries per rule:
//            pattern, element name, attribute name, attribute value.
//            Rules are applied in order; the text between two matches of a
//            rule is processed with the remaining rules, and text matched by
//            no rule is written as plain characters.
//
// Each match becomes <element attribute="value" frpos="NAM" frlemma="match">match</element>;
// when the rule is type="url" the element is wrapped in <a href="match">.
def tokenizeAs
tokenizeAs = { source, matchers ->
	// no rule left: plain text
	if (matchers.size() == 0) {
		writer.writeCharacters(source)
		return
	}

	def pattern = matchers[0]
	def elementName = matchers[1]
	def attributeName = matchers[2]
	def attributeValue = matchers[3]
	def remainingRules = matchers.drop(4)
	boolean wrapInLink = (attributeName == "type" && attributeValue == "url")

	def found = (source =~ pattern)
	def consumed = 0
	while (found.find()) {
		// text before the match is handled by the following rules
		tokenizeAs(source.substring(consumed, found.start()), remainingRules)

		def token = found.group()
		if (wrapInLink) {
			writer.writeStartElement("a")
			writer.writeAttribute("href", token)
		}
		writer.writeStartElement(elementName)
		writer.writeAttribute(attributeName, attributeValue)
		writer.writeAttribute("frpos", "NAM")
		writer.writeAttribute("frlemma", token)
		writer.writeCharacters(token)
		writer.writeEndElement()
		if (wrapInLink) {
			writer.writeEndElement()
		}
		consumed = found.end()
	}
	// text after the last match
	tokenizeAs(source.substring(consumed), remainingRules)
}
185

  
186
// Retweet prefix "RT @user: " followed by the retweeted content (group 1);
// (?s) lets the content span several lines. Compiled once for all records.
def rt_pattern = ~/(?s:^RT @[^:]+: (.*)$)/

// One CSV record gives: a <pb n="..."/> page break, then a textTag element
// carrying the metadata columns as attributes and one child element per
// text column.
while (reader.readRecord()) {

	writer.writeCharacters("  ")
	writer.writeEmptyElement("pb") // <pb/>
	writer.writeAttribute("n", ""+pb_n++)
	writer.writeCharacters("\n")

	writer.writeCharacters("  ")
	writer.writeStartElement(textTag)

	for (String m : metadataColumnList) {
		m = m.trim()
		writer.writeAttribute(m, CleanFile.clean(reader.get(m)))
	}

	writer.writeCharacters("\n")

	for (String t : textColumnList) {
		t = t.trim()

		writer.writeCharacters("    ")
		writer.writeStartElement(t)

		// look for re-tweets
		def c = reader.get(t)
		def rt_m = (c =~ rt_pattern)
		// find() is called exactly once per matcher: a second call on the
		// same anchored matcher would fail and group(1) would not be usable
		def retweet = rt_m.find()
		if (retweet) { // it is a re-tweet
			// take off and count all re-tweet prefixes
			def nPrefix = 0
			while (retweet) {
				c = rt_m.group(1)
				rt_m = (c =~ rt_pattern)
				retweet = rt_m.find()
				nPrefix++
			}
			def crt = sprintf("RT-%d ", nPrefix)+c
			tweet.put(crt, (tweet.get(crt) ?: 0) + 1)
			// NOTE(review): the content of a re-tweet is only counted, it is
			// not written to the XML output - presumably on purpose, confirm.
		} else {
			// count every occurrence (a plain put(c, 1) would reset the
			// frequency of a tweet appearing several times)
			tweet.put(c, (tweet.get(c) ?: 0) + 1)
			tokenizeAs(CleanFile.clean(c),
					   [/@\p{L}+/, "w", "type", "attag",
						/#\p{L}+/, "w", "type", "hashtag",
						/((http|ftp|https):\/\/[\p{L}\-_]+(\.[\p{L}\-_]+)+([\p{L}\-\.,@?^=%&amp;:\/~\+#]*[\p{L}\-\@?^=%&amp;\/~\+#])?)/, "w", "type", "url"])
		}
		writer.writeEndElement() // t
		writer.writeCharacters("\n")
	}

	writer.writeCharacters("  ")
	writer.writeEndElement() // textTag
	writer.writeCharacters("\n")
}
246

  
247
writer.writeEndElement() // rootTag
248
writer.writeCharacters("\n")
249
writer.close()
250
output.close()
251
reader.close()
252

  
253
// Summary: number of distinct tweet contents versus number of records read,
// followed by the contents sorted by decreasing frequency.
def nt = tweet.size()
println sprintf("\n%d tweets uniques sur %d lus", nt, pb_n-1)

if (nt > 50) {
	nt = 50
	println "Printing first 50 most frequent [re-]tweets:"
}
// take(nt) actually limits the listing to the announced 50 entries
tweet.sort { -it.value }.take(nt).each { entry ->
	def l = entry.key.length()
	if (l > 0) {
		// show at most the first 100 characters of the content
		println sprintf("[%d] %s...", entry.value, entry.key[0..(Math.min(l, 100)-1)])
	} else {
		println sprintf("Empty tweet content occurring %d times.", entry.value)
	}
}
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/txt/MultiLineSearchReplaceInDirectoryMacro.groovy (revision 3950)
1
package org.txm.macro.txt
2
// Copyright © - ENS de Lyon - http://textometrie.ens-lyon.fr
3
// @author sheiden
4
//
5
// This file is part of the TXM platform.
6
//
7
// The TXM platform is free software: you can redistribute it and/or modify
8
// it under the terms of the GNU General Public License as published by
9
// the Free Software Foundation, either version 3 of the License, or
10
// (at your option) any later version.
11
//
12
// The TXM platform is distributed in the hope that it will be useful,
13
// but WITHOUT ANY WARRANTY; without even the implied warranty of
14
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
// GNU General Public License for more details.
16
//
17
// You should have received a copy of the GNU General Public License
18
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
19
//
20
// $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $
21
// $LastChangedRevision: 2378 $
22
// $LastChangedBy: sheiden $
23
//
24

  
25
// STANDARD DECLARATIONS
26

  
27
// imports
28
import org.kohsuke.args4j.*
29
import groovy.transform.Field
30
import org.txm.rcpapplication.swt.widget.parameters.*
31

  
32
// PARAMETERS
33

  
34
// **change this parameter**
35
@Field @Option(name="inputDirectory", usage="Dossier contenant les fichiers à modifier", widget="Folder", required=true, def='/home')
36
		inputDirectory = new File(System.getProperty("user.home"), "Bureau/voeux")
37

  
38
// **change this parameter**
39
@Field @Option(name="extension", usage="extension des fichiers à modifier (expression régulière)", widget="String", required=true, def='\\.txt')
40
		extension = "\\.txt"
41

  
42
// **change this parameter**
43
@Field @Option(name="find", usage="Expression régulière à chercher", widget="String", required=true, def='’')
44
		find = "’"
45

  
46
// **change this parameter**
47
@Field @Option(name="replaceWith", usage="Chaîne de remplacement", widget="String", required=false, def='\'')
48
		replaceWith = "'"
49

  
50
// **change this parameter**
51
@Field @Option(name="encoding", usage="Encodage des caractères des fichiers", widget="String", required=true, def='UTF-8')
52
		encoding = "utf-8"
53

  
54
// PARAMETERS DIALOG
55

  
56
if (!ParametersDialog.open(this)) return;
57

  
58
// SANITY CHECK
59

  
60
if (inputDirectory==null || find==null || replaceWith==null) { println "** MultiLineSearchReplaceInDirectoryMacro: the input directory, the regular expression to search and replacement must be specified."; return}
61

  
62
if (!inputDirectory.exists()) { println "** MultiLineSearchReplaceInDirectoryMacro: impossible to access the '$inputDirectory' input directory."; return}
63

  
64
// MAIN BODY
65

  
66
println "-- working in $inputDirectory directory with files of '$extension' extension"
67
println "-- replacing '$find' with '$replaceWith'"
68

  
69
// find = /date="([0-9]+)-([0-9]+-[0-9]+)"/
70
// **change this parameter**
71
// replaceWith = 'date="$1-$2" year="$1"'
72
// **change this parameter** (warning: '$1', '$2'... can be interpreted by Groovy in "..." strings)
73
//
74
// RECETTE
75
// <tei:pb [^>]+>\n   <tei:p>[0-9]+</tei:p>\n   <tei:p>[^ ]+ 18[5-6][0-9]\.</tei:p>\n
76

  
77
def p = /$find/
78

  
79
// For each file of inputDirectory whose name matches the extension: apply the
// search/replace to the whole content (so the expression may span several
// lines) and replace the file with the result.
inputDirectory.eachFileMatch(~/.*$extension/) { file ->               // for each file matching extension
	if (!file.isFile()) return // a directory name may match the pattern too: skip it

	println "\n-- processing: "+file.getName()
	def tmp = File.createTempFile("SearchReplaceInDirectoryTemp", ".tmp", file.getParentFile()) // create temporary file
	try {
		tmp.withWriter(encoding) { writer ->
			writer.write(file.getText(encoding).replaceAll(p, replaceWith))
		}
		// replace the original in one step: unlike delete() + renameTo(), a
		// failure raises an exception and leaves the original file in place
		java.nio.file.Files.move(tmp.toPath(), file.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING)
	} catch (IOException e) {
		println "** MultiLineSearchReplaceInDirectoryMacro: impossible to rewrite '$file': $e"
	} finally {
		// no-op after a successful move, cleans up the temporary file otherwise
		tmp.delete()
	}
}
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/txt/Taltac2XMLMacro.groovy (revision 3950)
1
package org.txm.macro.line
2
// Copyright © 2018 - ENS de Lyon - http://textometrie.ens-lyon.fr
3
// @author sheiden
4
//
5
// This file is part of the TXM platform.
6
//
7
// The TXM platform is free software: you can redistribute it and/or modify
8
// it under the terms of the GNU General Public License as published by
9
// the Free Software Foundation, either version 3 of the License, or
10
// (at your option) any later version.
11
//
12
// The TXM platform is distributed in the hope that it will be useful,
13
// but WITHOUT ANY WARRANTY; without even the implied warranty of
14
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
// GNU General Public License for more details.
16
//
17
// You should have received a copy of the GNU General Public License
18
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
19
//
20
// $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $
21
// $LastChangedRevision: 2378 $
22
// $LastChangedBy: sheiden $
23
//
24

  
25
// imports
26
import org.kohsuke.args4j.*
27
import groovy.transform.Field
28
import org.txm.rcpapplication.swt.widget.parameters.*
29

  
30
// parameters
31

  
32
// **change this parameter**
33
@Field @Option(name="inputFile",usage="fichier Taltac", widget="File", required=true, def='')
34
def inputFile
35

  
36
// **change this parameter**
37
@Field @Option(name="characterEncoding",usage="Système d'encodage des caractères utilisé pour le fichier", widget="String", required=true, def='UTF-8')
38
def characterEncoding = "UTF-8"
39

  
40
@Field @Option(name="titleTag", usage="name of title ++++ tag", widget="String", required=false, def="titolo")
41
def titleTag
42

  
43
@Field @Option(name="addParagraphs", usage="encode each text line as a paragraph", widget="Boolean", required=true, def="true")
44
def addParagraphs
45

  
46
if (!ParametersDialog.open(this)) return
47

  
48
name = inputFile.getName()
49
idx = name.lastIndexOf(".")
50
if (idx > 0) name = name.substring(0, idx)
51
outputFile = new File(inputFile.getParentFile(), name+".xml")
52

  
53
pageNum = 2
54
outputFile.write('') // create empty file
55

  
56
// Converts the Taltac file line by line into XML:
//   "****ident *name=value ..." starts a <doc ident="..." name="value" ...>
//                               (a <pb/> is written between two docs),
//   "++++tag"                   starts a <tag> section (wrapped in <head> when
//                               tag equals titleTag),
//   any other line              is written as is, or as <p>line</p> when
//                               addParagraphs is set.
// The writer is opened in UTF-8 to agree with the XML declaration it writes.
outputFile.withWriter("UTF-8") { writer ->

	writer.println '<?xml version="1.0" encoding="UTF-8"?>\n<taltac>'

	def firstDoc = true
	def inTitle = false
	def currentTag = ""
	def metadata = "<hi>no metadata</hi>"
	def metadataEdited = false

	// Closes the currently open ++++ section (and its <head> wrapper when it
	// is the title). Returns true when a section was open, false otherwise.
	def closeCurrentTag = {
		if (currentTag.size() == 0) return false
		writer.println "</$currentTag>"
		if (inTitle) {
			writer.println "</head>"
			inTitle = false
		}
		currentTag = ""
		return true
	}

	inputFile.eachLine(characterEncoding) { line ->

		line = line.replaceAll("&", "&amp;")
		line = line.replaceAll("<", "&lt;")

		// ****yahoobanque1 *data=31gen *autore=da *rubrica=da *ora=08 *agenzia=reuters *grafici=da
		def docMatch = (line =~ /^\*\*\*\*([^ ]+) (.*)/)
		if (docMatch) {

			def ident = docMatch[0][1]
			def props = docMatch[0][2]
			// human readable version of the properties, written after the first section of the doc
			metadata = props.replaceAll(/ ?\*([^=]+)=([^ ]+)/) { all2, propname, propvalue ->
				"<item><hi>$propname</hi>: $propvalue</item>\n"
			}
			metadata = "<list type=\"unordered\">\n"+metadata+"</list>"
			metadataEdited = false
			// attribute version of the properties; quotes are escaped so that
			// a value cannot terminate the attribute
			props = props.replaceAll(/\*([^=]+)=([^ ]+)/) { all2, propname, propvalue ->
				"$propname=\"${propvalue.replace('"', '&quot;')}\""
			}
			if (firstDoc) {
				firstDoc = false
				writer.println "<doc ident=\"$ident\" $props>"
			} else {
				closeCurrentTag()
				writer.println "</doc>\n<pb n=\"${pageNum++}\"/>\n<doc ident=\"$ident\" $props>"
			}
			return // next line
		}

		// ++++titolo
		def tagMatch = (line =~ /^\+\+\+\+(.+)/)
		if (tagMatch) {

			def tag = tagMatch[0][1]

			// the metadata list is written once per doc, right after its first closed section
			if (closeCurrentTag() && !metadataEdited) {
				writer.println metadata
				metadataEdited = true
			}
			if (titleTag && tag == titleTag) {
				writer.println "<head>"
				inTitle = true
			}
			writer.println "<$tag>"
			currentTag = tag
			return // next line
		}

		// normal line
		if (addParagraphs) {
			writer.println "<p>$line</p>"
		} else {
			writer.println "$line"
		}
	}

	// close only what is actually open: no "</>" when no section is open,
	// "</head>" when the file ends inside the title, no "</doc>" without a doc
	closeCurrentTag()
	if (!firstDoc) writer.println "</doc>"
	writer.println "</taltac>"
	// the writer is closed by withWriter
}
172

  
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/txt/FindMultiLineRegExpMacro.groovy (revision 3950)
1
// Copyright © 2015 - ENS de Lyon - http://textometrie.ens-lyon.fr
2
// @author sheiden
3
//
4
// This file is part of the TXM platform.
5
//
6
// The TXM platform is free software: you can redistribute it and/or modify
7
// it under the terms of the GNU General Public License as published by
8
// the Free Software Foundation, either version 3 of the License, or
9
// (at your option) any later version.
10
//
11
// The TXM platform is distributed in the hope that it will be useful,
12
// but WITHOUT ANY WARRANTY; without even the implied warranty of
13
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
// GNU General Public License for more details.
15
//
16
// You should have received a copy of the GNU General Public License
17
// along with the TXM platform.  If not, see <http://www.gnu.org/licenses/>.
18
//
19
// $LastChangedDate: 2013-05-02 11:28:42 +0200 (jeu., 02 mai 2013) $
20
// $LastChangedRevision: 2378 $
21
// $LastChangedBy: sheiden $
22
//
23

  
24
// STANDARD DECLARATIONS
25

  
26
import org.kohsuke.args4j.*
27

  
28
import groovy.transform.Field
29

  
30
import java.nio.charset.Charset
31

  
32
import org.txm.rcpapplication.swt.widget.parameters.*
33
import org.txm.utils.*
34

  
35
import javax.xml.stream.*
36

  
37
import java.net.URL
38
import java.util.regex.Matcher;
39
import java.util.regex.Pattern;
40

  
41
// PARAMETERS
42

  
43
@Field @Option(name="inputDirectory", usage="TXT directory", widget="Folder", required=false, def="")
44
File inputDirectory;
45
@Field @Option(name="regexp", usage="Regular expression to match", widget="String", required=false, def="")
46
String regexp;
47
@Field @Option(name="encoding", usage="File encoding", widget="String", required=false, def="UTF-8")
48
String encoding;
49

  
50
// PARAMETERS DIALOG
51

  
52
if (inputDirectory == null) {
53
	if (!ParametersDialog.open(this)) return;
54
}
55

  
56
// SANITY CHECK
57

  
58
if (inputDirectory==null || regexp==null) { println "** FindMultiLineRegExpMacro: the input directory and the regular expression must be specified."; return}
59

  
60
if (!inputDirectory.exists()) { println "** FindMultiLineRegExpMacro: impossible to access the '$inputDirectory' input directory."; return}
61

  
62
// MAIN BODY
63

  
64
// Search the regular expression in the whole content of each file of the
// input directory (sub-directories are ignored) and print every match.
def p = /$regexp/

println "-- looking for: $regexp"

// listFiles() returns null when inputDirectory is not a readable directory
def candidates = inputDirectory.listFiles()
if (candidates == null) {
	println "** FindMultiLineRegExpMacro: impossible to list the files of the '$inputDirectory' input directory."
	return
}

for (File inputfile : candidates) {

	if (inputfile.isDirectory()) continue // ignore

	// read with the 'encoding' parameter (UTF-8 when unset) instead of the platform default
	def matcher = inputfile.getText(encoding ?: "UTF-8") =~ p

	def count = matcher.size()
	println "\n-- file:"+inputfile.getName() + " "+count + " match" +((count > 1)?"s":"")

	if (count > 0) {
		matcher.each { println it }
	}
}
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/txt/FindRegExpMacro.groovy (revision 3950)
1
package org.txm.macro
2

  
3
import org.kohsuke.args4j.*
4

  
5
import groovy.transform.Field
6

  
7
import java.nio.charset.Charset
8

  
9
import org.txm.rcpapplication.swt.widget.parameters.*
10
import org.txm.utils.*
11

  
12
import javax.xml.stream.*
13

  
14
import java.net.URL
15
import java.util.regex.Matcher;
16
import java.util.regex.Pattern;
17

  
18
@Field @Option(name="inputDirectory",usage="TXT directory", widget="Folder", required=false, def="directory path")
19
File inputDirectory;
20
@Field @Option(name="regexp",usage="Regular expression to match", widget="String", required=false, def="regular expression")
21
String regexp;
22
@Field @Option(name="encoding",usage="File encoding", widget="String", required=false, def="UTF-8")
23
String encoding;
24

  
25
if (!ParametersDialog.open(this)) return;
26

  
27
// Search the regular expression line by line in each file of the input
// directory (sub-directories are ignored) and print the matching lines with
// their line number.
if (inputDirectory == null || regexp == null) {
	println "** FindRegExpMacro: the input directory and the regular expression must be specified."
	return
}

// listFiles() returns null when inputDirectory is not a readable directory
def files = inputDirectory.listFiles()
if (files == null) {
	println "** FindRegExpMacro: impossible to access the '$inputDirectory' input directory."
	return
}

// compiled once instead of once per line
def p = Pattern.compile(regexp)

println "processing: "+files
for (File inputfile : files) {
	if (inputfile.isDirectory()) continue // ignore

	def lines = []
	// read with the 'encoding' parameter (UTF-8 when unset)
	inputfile.eachLine(encoding ?: "UTF-8") { line, n ->
		if (p.matcher(line).find()) {
			lines << "  line $n: $line"
		}
	}

	println inputfile.getName() + " "+lines.size() + " match" +((lines.size() > 1)?"s":"")
	if (lines.size() > 0) {
		for (String s : lines ) println s
	}
}
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/conversion/PennTreebank2TIGERMacro.groovy (revision 3950)
1
// STANDARD DECLARATIONS
2
package org.txm.macro
3

  
4
import org.kohsuke.args4j.*
5
import groovy.transform.Field
6
import org.txm.rcpapplication.swt.widget.parameters.*
7

  
8
import ims.tiger.importfilter.*
9
import ims.tiger.importfilter.bracketing.*
10
import org.apache.log4j.*
11

  
12
// BEGINNING OF PARAMETERS
13

  
14
// Declare each parameter here
15
// (available widget types: Query, File, Folder, String, Text, Boolean, Integer, Float and Date)
16

  
17
@Field @Option(name="SourceFile", usage="SourceFilename", widget="File", required=false, def="input.mrg")
18
def SourceFile
19

  
20
@Field @Option(name="TargetFile", usage="TargetFilename", widget="File", required=false, def="output.xml")
21
def TargetFile
22

  
23
@Field @Option(name="XMLTargetID", usage="XMLTargetID", widget="String", required=false, def="XMLTargetID")
24
def XMLTargetID
25

  
26
@Field @Option(name="SourceFolder", usage="SourceFolder (leave empty if SourceFile set)", widget="Folder", required=false, def="input <remove if SourceFile set>")
27
def SourceFolder
28

  
29
@Field @Option(name="TargetFolder", usage="TargetFolder (leave empty if TargetFile set)", widget="Folder", required=false, def="output")
30
def TargetFolder
31

  
32
@Field @Option(name="MaximumNumberOfSentences", usage="MaximumNumberOfSentences", widget="Integer", required=true, def="0")
33
def MaximumNumberOfSentences
34

  
35
// Open the parameters input dialog box
36
if (!ParametersDialog.open(this)) return
37

  
38
// END OF PARAMETERS
39

  
40
BasicConfigurator.configure()
41

  
42
ImportFilter filter = new MyUPennFilter()
43
ImportFilterHandler handler = new SilentImportFilterHandler()
44
filter.setImportFilterHandler(handler)
45

  
46
filter.setCompression(false)
47
filter.setSchemaFilename(System.getProperty("user.home")+"/TXM/xml/xsd/tiger/TigerXML.xsd")
48
filter.setMaximumNumberOfSentences(MaximumNumberOfSentences-1)
49

  
50
// Two modes: a single file (SourceFile -> TargetFile, ID = XMLTargetID) when
// SourceFolder is not set, otherwise every file of SourceFolder is converted
// to TargetFolder/<name without extension>.xml with the base name as ID.
if (SourceFolder == null) {

	if (SourceFile == null || TargetFile == null) {
		println "** PennTreebank2TIGERMacro: SourceFile and TargetFile must be set when SourceFolder is not."
		return
	}

	def SourceFilename = SourceFile.getAbsolutePath()
	def TargetFilename = TargetFile.getAbsolutePath()

	filter.setSourceFilename(SourceFilename)
	filter.setXMLTargetFilename(TargetFilename)
	filter.setXMLTargetID(XMLTargetID)
	println "Converting '$SourceFilename' to '$TargetFilename'"

	try {
		filter.startConversion()
	} catch (Exception e) {
		println "** PennTreebank2TIGERMacro: conversion of '$SourceFilename' failed: $e"
		e.printStackTrace()
	}
} else {

	if (TargetFolder == null) {
		println "** PennTreebank2TIGERMacro: TargetFolder must be set when SourceFolder is."
		return
	}

	// listFiles() returns null when SourceFolder is not a readable directory
	def sources = SourceFolder.listFiles()
	if (sources == null) {
		println "** PennTreebank2TIGERMacro: impossible to access the '$SourceFolder' source folder."
		return
	}

	for (File inputfile : sources) {
		if (inputfile.isDirectory()) continue // ignore

		def fileNameE = inputfile.getName()
		def i = fileNameE.lastIndexOf('.')
		// keep the whole name when there is no extension to strip (no dot,
		// or a leading dot only): substring(0, -1) would throw
		def fileName = (i > 0) ? fileNameE.substring(0, i) : fileNameE
		def fileId = fileName
		def input = inputfile.getAbsolutePath()
		def output = TargetFolder.getAbsolutePath()+"/"+fileName+".xml"
		println "Converting '"+input+"' to '"+output+"'"
		filter.setSourceFilename(input)
		filter.setXMLTargetFilename(output)
		filter.setXMLTargetID(fileId)

		try {
			filter.startConversion()
		} catch (Exception e) {
			// best effort: report and go on with the next file
			println "** PennTreebank2TIGERMacro: conversion of '"+input+"' failed: "+e
			e.printStackTrace()
		}
	}
}
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/conversion/PennTreebank2TIGERMacro.readme (revision 3950)
1
PennTreebank2TIGER macro
2
========================
3
2016-07, sheiden
4

  
5
* Install
6

  
7
- unzip PennTreebank2TIGER.zip
8
- copy TigerXML.xsd in $HOME/TXM/xml/xsd/tiger/TigerXML.xsd
9
- copy PennTreebank2TIGERMacro.groovy in $HOME/TXM/scripts/macro/org/txm/macro
10
- refresh macro view
11

  
12
* Parameters
13

  
14
- SourceFile: selects the single-file conversion mode
15
- SourceFolder: selects the folder conversion mode (every file of the folder is converted)
16
-> fill only the field to be used
17
- TargetFile: if SourceFile then use that target file
18
- XMLTargetID: if SourceFile then use that target ID
19
- TargetFolder: if SourceFolder is set, the target files are saved in that folder
20
 - target files are named from the source filename with the extension stripped and '.xml' added
21
 - target IDs are named from the source filename with the extension stripped
22
- MaximumNumberOfSentences: maximum number of sentences to convert per file (set to '0' to convert all sentences)
23

  
TXM/trunk/bundles/org.txm.groovy.core/src/groovy/org/txm/macro/conversion/TigerXML.xsd (revision 3950)
1
<?xml version="1.0"?>
2

  
3
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
4

  
5
 <!-- ==================================================================
6
      XML Schema for the TIGER-XML format
7
      http://www.ims.uni-stuttgart.de/projekte/TIGER/public/TigerXML.xsd
8
      ==================================================================
9
      TIGER Project, Wolfgang Lezius
10
      IMS, University of Stuttgart, 04/01/2003
11
      ================================================================== -->
12

  
13

  
14
  <!-- ======================================================
15
       INCLUDES DECLARATION OF THE HEADER
16
       ====================================================== -->
17
  <xsd:include schemaLocation="TigerXMLHeader.xsd" />
18

  
19

  
20
  <!-- ======================================================
21
       INCLUDES DECLARATION OF SUBCORPORA AND SENTENCES
22
       ====================================================== -->
23
  <xsd:include schemaLocation="TigerXMLSubcorpus.xsd" />
24

  
25

  
26
  <!-- ======================================================
27
       DECLARATION OF THE CORPUS DOCUMENT
28
       ====================================================== -->
29

  
30
  <!-- declaration of the root element: corpus -->
31

  
32
  <xsd:element name="corpus">
33
  
34
    <xsd:complexType>
35

  
36
      <xsd:sequence>
37

  
38
        <xsd:choice>           
39
           <!-- header of the document is optional -->
40
           <xsd:element name="head" type="headType" minOccurs="0" maxOccurs="1"/>
41
        </xsd:choice>
42

  
43
        <xsd:element name="body" type="bodyType" minOccurs="1" maxOccurs="1"/>
44

  
45
      </xsd:sequence>
46

  
47
      <!-- corpus ID -->
48
      <xsd:attribute name="id"      type="idType"     use="required" />
49

  
50
      <!-- optional attribute: TigerXML version; used by TIGERSearch only -->
51
      <xsd:attribute name="version" type="xsd:string" use="optional" />
52

  
53
    </xsd:complexType>
54
  
55
  </xsd:element>
56

  
57

  
58
  <!-- declaration of the body type -->
59

  
60
  <xsd:complexType name="bodyType">
61

  
62
    <xsd:choice minOccurs="1" maxOccurs="unbounded">
63
      <xsd:element name="subcorpus" type="subcorpusType" minOccurs="1" maxOccurs="1"/>
64
      <xsd:element name="s" type="sentenceType" minOccurs="1" maxOccurs="1"/>
65
    </xsd:choice>
66

  
67
  </xsd:complexType>
68

  
69

  
70
</xsd:schema>
0 71

  

Formats disponibles : Unified diff