Revision 2769

tmp/org.txm.groovy.core/src/groovy/org/txm/macro/export/ExportTextContentMacro.groovy (revision 2769)
74 74
		if (breaks_pos.contains(p)) words[j] = words[j] +"\n" 
75 75
	}
76 76
	writer.println LangFormater.format(StringUtils.join(words, " "),
77
				corpus.getLanguage());
77
				corpus.getLang());
78 78
	writer.close();
79 79
}  
80 80

  
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/conversion/EuroPressToXML2018Macro.groovy (revision 2769)
5 5
import java.text.DecimalFormat
6 6
import org.txm.utils.xml.DomUtils
7 7
import org.txm.importer.ValidateXml
8
import org.w3c.tidy.Tidy
9 8
import groovy.util.XmlParser
10 9
import org.kohsuke.args4j.*
11 10
import groovy.transform.Field
......
14 13
import org.jsoup.nodes.Document.OutputSettings.Syntax
15 14

  
16 15
// README
17
// This macros needs the following libraries: jsoup-1.11.3.jar and jtidy
16
// This macro needs the following library: jsoup-1.11.3.jar
18 17

  
19 18
// BEGINNING OF PARAMETERS
20 19

  
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/debug/PreferencesMacro.groovy (revision 2769)
7 7
import org.txm.core.preferences.TXMPreferences
8 8
import org.txm.searchengine.cqp.CQPPreferences
9 9

  
10
//org.txm.core.preferences.TXMPreferences.dump();
10
org.txm.core.preferences.TXMPreferences.dump();
11 11

  
12
println TXMPreferences.getString(CQPPreferences.CQI_SERVER_PATH_TO_CQPLIB, CQPPreferences.PREFERENCES_NODE);
12
//println CQPPreferences.getInstance().getProperties()
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/nlp/TT2XMLInDirectoryMacro.groovy (revision 2769)
26 26
	
27 27
	XMLOutputFactory factory = XMLOutputFactory.newInstance()
28 28
	
29
	for (File inputfile : inputDirectory.listFiles().sort()) {
29
	def files = inputDirectory.listFiles()
30
	if (files == null || files.length == 0) {
31
		println "Error: no file to process in $inputDirectory"
32
		return false;
33
	}
34
	for (File inputfile : files.sort()) {
30 35
		if (inputfile.isDirectory() || inputfile.isHidden() || !inputfile.getName().endsWith(".tt")) continue // ignore
31 36
		println " file: "+inputfile
32 37
	
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/csv/Table2CorpusMacro.groovy (revision 2769)
1
package org.txm.macro.csv
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcp.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10

  
11
// ---- Macro parameters --------------------------------------------------------
// The *ColumnList parameters are comma-separated lists of table column names.

// Table file to convert; it is read with TableReader.
@Field @Option(name="inputFile", usage="CSV File", widget="File", required=false, def="file.xlsx")
		File inputFile;

// Directory receiving one "<text id>.xml" file per text.
@Field @Option(name="outputDirectory", usage="output directory", widget="File", required=false, def="directory")
		File outputDirectory;

// "column=regexp": a row is a text row when the value of that column matches the regexp.
@Field @Option(name="textSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sommaire")
		def textSelector;

// "column=regexp": a row that is not a text row is a structure row when the value of that column matches the regexp.
@Field @Option(name="structureSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sujet")
		def structureSelector;

// Column holding the identifier of a row; text rows are grouped (and their file is named) by this value.
@Field @Option(name="textIDColumn", usage="text id column", widget="String", required=false, def="Identifiant de la notice")
		def textIDColumn;

// Column of a structure row holding the identifier of the text it belongs to.
@Field @Option(name="joinColumn", usage="jointure column, values should point to the textIDColumn values", widget="String", required=false, def="Lien notice principale")
		def joinColumn;

// Columns of a text row written as attributes of <text> and listed in its <metadata> element.
@Field @Option(name="textMetadataColumnList", usage="text metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
		def textMetadataColumnList;

// Columns of a text row written as content of <text>.
@Field @Option(name="textContentColumnList", usage="text content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
		def textContentColumnList;

// Name of the element written for each structure row.
// (the usage label used to be a copy of the structureMetadataColumnList one)
@Field @Option(name="structureTag", usage="structure element name", widget="String", required=false, def="div")
		def structureTag;

// Columns of a structure row written as attributes of the structure element and listed in its <metadata> element.
@Field @Option(name="structureMetadataColumnList", usage="structure metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
		def structureMetadataColumnList;

// Columns of a structure row written as content of the structure element.
@Field @Option(name="structureContentColumnList", usage="structure content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
		def structureContentColumnList;

// Metadata columns handed to writeMetadataDate() instead of being written as a plain attribute.
@Field @Option(name="metadataDateColumnList", usage="metadata columns of type=Date", widget="String", required=false, def="")
		def metadataDateColumnList;
// Content columns handed to writeContentPrefix().
// (the usage label used to be a copy of the metadataDateColumnList one)
@Field @Option(name="prefixContentColumnList", usage="content columns written as a list of prefixed items", widget="String", required=false, def="")
		def prefixContentColumnList;
// Content columns handed to writeContentList().
// (the usage label used to be a copy of the metadataDateColumnList one)
@Field @Option(name="listContentColumnList", usage="content columns written as one paragraph per line", widget="String", required=false, def="")
		def listContentColumnList;
50

  
51

  
52
//@Field @Option(name="structureOrderColumn", usage="structure column coding structure order", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
53
//		def structureOrderColumn;
54

  
55
if (!ParametersDialog.open(this)) return;

// Turns a comma-separated parameter into a list of trimmed, non-empty column names.
// Trimming here keeps the names identical to the ones compared with the table headers;
// dropping empty entries makes an empty parameter an empty list instead of [""].
def splitColumns = { value ->
	(value ?: "").toString().split(",").collect { it.trim() }.findAll { it.length() > 0 }
}

// Cuts a "column=regexp" parameter at its first "=".
// Returns [column, regexp], or null when the column part is missing.
// The regexp is left untouched: its spaces may be significant.
def splitSelector = { value ->
	String selector = (value ?: "").toString()
	int eq = selector.indexOf("=")
	if (eq <= 0) return null
	return [selector.substring(0, eq).trim(), selector.substring(eq + 1)]
}

textMetadataColumnList = splitColumns(textMetadataColumnList)
textContentColumnList = splitColumns(textContentColumnList)
structureMetadataColumnList = splitColumns(structureMetadataColumnList)
structureContentColumnList = splitColumns(structureContentColumnList)
metadataDateColumnList = splitColumns(metadataDateColumnList)
prefixContentColumnList = splitColumns(prefixContentColumnList)
listContentColumnList = splitColumns(listContentColumnList)
textIDColumn = (textIDColumn ?: "").toString().trim()
joinColumn = (joinColumn ?: "").toString().trim()

def parsedTextSelector = splitSelector(textSelector)
def parsedStructureSelector = splitSelector(structureSelector)
if (parsedTextSelector == null || parsedStructureSelector == null) {
	// substring(0, -1) used to throw here when a selector had no "="
	println "Error: textSelector and structureSelector must be written column_to_test=regexp (textSelector='$textSelector', structureSelector='$structureSelector')"
	return
}
textSelector = parsedTextSelector
structureSelector = parsedStructureSelector


println "textIDColumn, joinColumn=$textIDColumn, $joinColumn"
println "textMetadataColumnList columns: $textMetadataColumnList"
println "textContentColumnList columns: $textContentColumnList"
println "structureMetadataColumnList columns: $structureMetadataColumnList"
println "structureContentColumnList columns: $structureContentColumnList"

println "text selector="+textSelector
println "structure selector="+structureSelector
println "structureTag="+structureTag
80

  
81
TableReader reader = new TableReader(inputFile);
if (!reader.readHeaders()) {
	println "Error: no header"
	return
}
def headers = Arrays.asList(reader.getHeaders())
println "table columns: $headers"

// Every column named by a parameter must be one of the table headers.
// The selector columns are checked as well: they are read for every record.
def requiredColumns = [
	"selection":[textIDColumn, joinColumn],
	"selector":[textSelector[0], structureSelector[0]],
	"textMetadataColumnList":textMetadataColumnList,
	"textContentColumnList":textContentColumnList,
	"structureMetadataColumnList":structureMetadataColumnList,
	"structureContentColumnList":structureContentColumnList]

def ok = true
for (def required : requiredColumns.entrySet()) {
	for (String m : required.value) {
		String column = m.trim()
		if (headers.contains(column)) continue
		println "Error: missing ${required.key} column: $column"
		ok = false // keep going so that all the missing columns are reported at once
	}
}
if (!ok) {
	reader.close() // nothing will be read: release the table file
	return;
}
102

  
103
// group by text: texts maps a text id to its records, text record(s) first, then
// the structure records in reading order
def texts = new LinkedHashMap()
def nRecord = 0

// Trimmed value of a column of the current record, or null when the cell is missing.
// (trimming before testing for null made the null tests below unreachable)
def cell = { column -> reader.get(column)?.trim() }

while (reader.readRecord()) {
	nRecord++

	String id = cell(textIDColumn) ?: ""
	String join = cell(joinColumn) ?: ""
	String textSelectorValue = cell(textSelector[0])
	String structureSelectorValue = cell(structureSelector[0])

	if (textSelectorValue != null && textSelectorValue.matches(textSelector[1])) {
		// text record: goes in front of the structure records already met for this text
		if (!texts.containsKey(id)) texts[id] = []
		texts[id].add(0, reader.getRecord())
	} else if (structureSelectorValue != null && structureSelectorValue.matches(structureSelector[1])) {
		// structure record: appended to the group of the text it points to
		if (!texts.containsKey(join)) texts[join] = []
		texts[join].add(reader.getRecord())
	}
	// any other record is ignored
}
println "N lines: "+nRecord
println "N groups: "+texts.size()
if (texts.size()  == 0) {
	println "No text found. Aborting."
	reader.close() // nothing will be written: release the table file
	return
}
130
// mkdirs() also creates the missing parent directories, mkdir() does not
outputDirectory.mkdirs()

// Writes the metadata of a row on the element that has just been started:
// one attribute per column (date columns go through writeMetadataDate), then a
// <metadata><list type="unordered"> child with one <item> per column.
def writeMetadata = { row, columns, xml ->
	for (String att : columns) {
		if (att in metadataDateColumnList) {
			writeMetadataDate(row, att, xml)
		} else {
			xml.writeAttribute(AsciiUtils.buildAttributeId(att), row.get(att) ?: "")
		}
	}
	xml.writeCharacters("\n") // line breaks are only there to keep the XML readable

	xml.writeStartElement("metadata")
	xml.writeStartElement("list")
	xml.writeAttribute("type", "unordered")
	xml.writeCharacters("\n")
	for (String att : columns) {
		xml.writeStartElement("item")
		xml.writeCharacters(att+" : "+(row.get(att) ?: "").replaceAll("\n", ";"))
		xml.writeEndElement() // item
		xml.writeCharacters("\n")
	}
	xml.writeEndElement() // list
	xml.writeEndElement() // metadata
	xml.writeCharacters("\n")
}

// Writes the content columns of a row: prefix columns through writeContentPrefix,
// list columns through writeContentList, any other column as
// <p type="column"><head><hi>column : </hi></head>value</p>.
// paragraphId, when not null, is written as the "id" attribute of each <p>.
def writeContents = { row, columns, xml, paragraphId ->
	for (String att : columns) {
		if (att in prefixContentColumnList) {
			writeContentPrefix(row, att, xml)
		} else if (att in listContentColumnList) {
			writeContentList(row, att, xml)
		} else {
			xml.writeStartElement("p")
			xml.writeAttribute("type", att.trim())
			if (paragraphId != null) xml.writeAttribute("id", paragraphId)
			xml.writeCharacters("\n")

			xml.writeStartElement("head")
			xml.writeStartElement("hi")
			xml.writeCharacters(att+" : ")
			xml.writeEndElement() // hi
			xml.writeEndElement() // head

			xml.writeCharacters(row.get(att) ?: "")
			xml.writeEndElement() // p
			xml.writeCharacters("\n")
		}
	}
}

XMLOutputFactory factory = XMLOutputFactory.newInstance()
try {
	for (def id : texts.keySet()) {
		def toWrite = texts[id]
		def text = toWrite[0]

		// the first record of a group is its text record, unless the group was only
		// built from structure records pointing to a text that does not exist
		String textSelectorValue = text.get(textSelector[0])?.trim()
		if (textSelectorValue == null || !textSelectorValue.matches(textSelector[1])) {
			println "ERROR: '$id' text group with  no text line"
			continue
		}
		println "Processing text: $id"

		File outputfile = new File(outputDirectory, id+".xml")
		FileOutputStream output = new FileOutputStream(outputfile)
		try {
			XMLStreamWriter writer = factory.createXMLStreamWriter(output, "UTF-8")

			writer.writeStartDocument("UTF-8","1.0")
			writer.writeCharacters("\n")
			writer.writeStartElement("TEI")
			writer.writeCharacters("\n")
			writer.writeStartElement("teiHeader")
			writer.writeEndElement() // teiHeader
			writer.writeCharacters("\n")

			writer.writeStartElement("text")
			writer.writeAttribute("id", id)
			writeMetadata(text, textMetadataColumnList, writer)
			writeContents(text, textContentColumnList, writer, null)

			// the records after the first one are the structures of the text:
			// each one is preceded by a numbered <pb/>
			for (int i = 1 ; i < toWrite.size() ; i++) {
				def record = toWrite[i]

				writer.writeEmptyElement("pb") // <pb/>
				writer.writeAttribute("n", ""+i)

				writer.writeStartElement(structureTag)
				writeMetadata(record, structureMetadataColumnList, writer)
				// the content is read from the structure record itself: the prefix and
				// list columns used to be read from the text record by mistake
				writeContents(record, structureContentColumnList, writer, record.get(textIDColumn) ?: "")
				writer.writeEndElement() // structure
				writer.writeCharacters("\n")
			}

			writer.writeEndElement() // text
			writer.writeCharacters("\n")
			writer.writeEndElement() // TEI
			writer.close()
		} finally {
			// XMLStreamWriter.close() does not close the underlying stream
			output.close()
		}
	}
} finally {
	// closed once, when all the texts are written (it used to be closed after each text)
	reader.close()
}
270

  
271
/**
 * Writes the value of a column as paragraphs, one per non-empty line.
 *
 * A value containing line breaks is written as one <p rend="list"> per non-empty
 * line, the last line included even when it is not followed by a line break.
 * A value without any line to list is written as is in a single <p rend="no-list">.
 *
 * @param record the row to read; only record.get(att) is called
 * @param att the name of the column to write
 * @param writer the XMLStreamWriter to write to, positioned inside the parent element
 */
def writeContentList(def record, def att, def writer) {
	writer.writeCharacters("\n")

	// a missing cell is written as an empty paragraph instead of failing
	String value = record.get(att) ?: ""

	// local variables: undeclared ones would be stored in the script binding
	def lines = []
	if (value.contains("\n")) {
		lines = value.split("\n").findAll { it.length() > 0 }
	}

	if (lines.isEmpty()) {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "no-list")
		writer.writeCharacters(value)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
		return
	}

	for (String line : lines) {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "list")
		writer.writeCharacters(line)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
	}
}
294

  
295
/**
 * Writes the value of a column as a <list rend="prefixes" type="unordered">.
 *
 * Each "XXX: content ;" part of the value (three uppercase letters, an optional
 * colon, the content, then at least one space followed by ";" or a line break)
 * becomes <item type="XXX"><span>XXX </span>content</item>. When the value
 * contains no such part, it is written as is inside the list.
 *
 * @param record the row to read; only record.get(att) is called
 * @param att the name of the column to write
 * @param writer the XMLStreamWriter to write to, positioned inside the parent element
 */
def writeContentPrefix(def record, def att, def writer) {
	writer.writeCharacters("\n")
	writer.writeStartElement("list")
	writer.writeAttribute("rend", "prefixes")
	writer.writeAttribute("type", "unordered")
	writer.writeCharacters("\n")

	// a missing cell is written as an empty list instead of failing
	String value = record.get(att) ?: ""

	// one pass over the value; group 1 is the prefix, group 2 the content
	// (local variables: undeclared ones would be stored in the script binding)
	def parts = (value =~ /(?s)([A-Z]{3}):? *([^;\n]+?) +[;\n]/)
	boolean found = false
	while (parts.find()) {
		found = true
		String prefix = parts.group(1)
		writer.writeCharacters("\t")
		writer.writeStartElement("item")
		writer.writeAttribute("type", prefix)
		writer.writeStartElement("span")
		writer.writeCharacters(prefix+" ")
		writer.writeEndElement() // span
		writer.writeCharacters(parts.group(2))
		writer.writeEndElement() // item
		writer.writeCharacters("\n")
	}
	if (!found) {
		writer.writeCharacters(value)
	}
	writer.writeEndElement() // list
	writer.writeCharacters("\n")
}
322

  
323
/**
 * Writes a dd/MM/yyyy date column as five attributes of the current element.
 *
 * With N the attribute name built by AsciiUtils.buildAttributeId(att), the attributes are:
 * Njour (day), Njoursemaine (day name, formatted with the pattern 'EEEE'),
 * Nmois (month), Nannee (year) and Ntri (yyyy-MM-dd, which sorts chronologically).
 *
 * When the value contains no dd/MM/yyyy date, a warning is printed and the value is
 * written as is in the attribute N, like a column that is not a date column.
 *
 * @param record the row to read; only record.get(att) is called
 * @param att the name of the column to write
 * @param writer the XMLStreamWriter to write to, positioned on a started element
 */
def writeMetadataDate(def record, def att, def writer) {
	String value = record.get(att) ?: ""
	String att_normalized = AsciiUtils.buildAttributeId(att)

	// local variable: an undeclared one would be stored in the script binding
	def date = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
	if (!date.find()) {
		// reading the groups of a failed match used to abort the whole macro
		println "Warning: '$att' value is not a dd/MM/yyyy date: '$value'"
		writer.writeAttribute(att_normalized, value)
		return
	}

	String day = date.group(1)
	String month = date.group(2)
	String year = date.group(3)
	// only the matched date is parsed: the value may contain other characters around it
	String dayName = new java.text.SimpleDateFormat('EEEE').format(Date.parse("dd/MM/yyyy", date.group(0)))

	writer.writeAttribute(att_normalized+"jour", day)
	writer.writeAttribute(att_normalized+"joursemaine", dayName)
	writer.writeAttribute(att_normalized+"mois", month)
	writer.writeAttribute(att_normalized+"annee", year)
	writer.writeAttribute(att_normalized+"tri", year+"-"+month+"-"+day)
}
333

  
334
/*
335
 String name = inputFile.getName()
336
 int idx = name.lastIndexOf(".")
337
 if (idx > 0) name = name.substring(0, idx)
338
 */
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/imports/OpenImportFormMacro.groovy (revision 2769)
4 4
import groovy.transform.Field
5 5
import org.txm.rcp.swt.widget.parameters.*
6 6
import org.txm.rcp.commands.*
7
import org.txm.Toolbox
7 8

  
8 9
@Field @Option(name="scriptFile", usage="an example file", widget="File", required=true, def="xxxLoader.groovy")
9 10
File scriptFile
......
22 23
	public void run() {
23 24
		try {
24 25
			String path = script.getAbsolutePath()
25
			OpenImportForm.openfile(path);
26
			String rootpath = new File(Toolbox.getTxmHomePath(), "scripts/groovy/user/org/txm/scripts/importer")
27
	
28
			if (path.startsWith(rootpath)) {
29
				OpenImportForm.openfile(path.substring(Toolbox.getTxmHomePath().length()));
30
			} else {
31
				println "Error: import start script must be included in $rootpath"
32
			}
26 33
		} catch (Exception e) {
27 34
			e.printStackTrace();
28 35
		}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/commands/CrossedPartitionBuilderMacro.groovy (revision 2769)
8 8
import org.txm.searchengine.cqp.corpus.*
9 9
import org.txm.searchengine.cqp.corpus.Partition
10 10
import org.txm.utils.logger.Log
11
import org.txm.rcp.views.*
11
import org.txm.rcp.views.corpora.*
12 12

  
13 13
//BEGINNING OF PARAMETERS
14 14
if (!(corpusViewSelection instanceof CQPCorpus)) {
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/export/conll2009/ToCoNLL2009.groovy (revision 2769)
31 31
		def matches = corpus.query(new CQLQuery("<s> []+ </s>"), "TMPEXPORTCONNL", false).getMatches()
32 32
		if (debug) println matches.size()
33 33
		int npositions = 0;
34
		for (def match : matches) npositions += match.length +1
34
		for (def match : matches) npositions += match.size() +1
35 35
		if (debug) println "npositions= $npositions"
36 36

  
37 37
		int[] positions = new int[npositions+1]

Also available in: Unified diff