Révision 3209

tmp/org.txm.groovy.core/src/groovy/org/txm/macro/office/Table2CorpusMacro.groovy (revision 3209)
1
package org.txm.macro.csv
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcp.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10

  
11
// ---- Macro parameters, edited through the ParametersDialog ----

// Table file to read
@Field @Option(name="inputFile", usage="CSV File", widget="File", required=false, def="file.xlsx")
		File inputFile;

// Directory receiving one XML file per text
@Field @Option(name="outputDirectory", usage="output directory", widget="File", required=false, def="directory")
		File outputDirectory;

// "column=regexp" test selecting the lines that open a text
@Field @Option(name="textSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sommaire")
		def textSelector;

// "column=regexp" test selecting the lines that become structures of a text
@Field @Option(name="structureSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sujet")
		def structureSelector;

// Column holding the identifier of a line
@Field @Option(name="textIDColumn", usage="text id column", widget="String", required=false, def="Identifiant de la notice")
		def textIDColumn;

// Column of a structure line holding the identifier of the text it belongs to
@Field @Option(name="joinColumn", usage="jointure column, values should point to the textIDColumn values", widget="String", required=false, def="Lien notice principale")
		def joinColumn;

@Field @Option(name="textMetadataColumnList", usage="text metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
		def textMetadataColumnList;

@Field @Option(name="textContentColumnList", usage="text content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
		def textContentColumnList;

// Name of the XML element written for each structure line
// (the usage text previously repeated the one of structureMetadataColumnList)
@Field @Option(name="structureTag", usage="structure tag name", widget="String", required=false, def="div")
		def structureTag;

@Field @Option(name="structureMetadataColumnList", usage="structure metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
		def structureMetadataColumnList;

@Field @Option(name="structureContentColumnList", usage="structure content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
		def structureContentColumnList;

// Metadata columns holding dd/MM/yyyy dates, split into day/month/year attributes by writeMetadataDate()
@Field @Option(name="metadataDateColumnList", usage="metadata columns of type=Date", widget="String", required=false, def="")
		def metadataDateColumnList;
// Content columns written by writeContentPrefix()
// (the usage text previously repeated the one of metadataDateColumnList)
@Field @Option(name="prefixContentColumnList", usage="content columns holding 'XXX: value' prefixed items separated by ';' or new lines", widget="String", required=false, def="")
		def prefixContentColumnList;
// Content columns written by writeContentList()
// (the usage text previously repeated the one of metadataDateColumnList)
@Field @Option(name="listContentColumnList", usage="content columns holding one list item per line", widget="String", required=false, def="")
		def listContentColumnList;
50

  
51

  
52
//@Field @Option(name="structureOrderColumn", usage="structure column coding structure order", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
53
//		def structureOrderColumn;
54

  
55
// Open the parameters dialog; stop when it is not validated
if (!ParametersDialog.open(this)) return;

// Turns a comma separated parameter value into a list of trimmed column names.
// The header check compares trimmed names, so the names used afterwards to read
// the cells must be trimmed as well, otherwise "a, b" passes the check and then
// reads the unknown " b" column.
def splitColumns = { String value -> value.split(",").collect { it.trim() } }

textMetadataColumnList = splitColumns(textMetadataColumnList)
textContentColumnList = splitColumns(textContentColumnList)
structureMetadataColumnList = splitColumns(structureMetadataColumnList)
structureContentColumnList = splitColumns(structureContentColumnList)
metadataDateColumnList = splitColumns(metadataDateColumnList)
prefixContentColumnList = splitColumns(prefixContentColumnList)
listContentColumnList = splitColumns(listContentColumnList)

// A selector is written "column=regexp": split it on the first "=" into [column, regexp].
// Returns null (after printing an error) when there is no "=" instead of failing in substring().
def parseSelector = { String name, String value ->
	int sep = value.indexOf("=")
	if (sep < 0) {
		println "Error: the $name parameter must be written 'column_to_test=regexp': $value"
		return null
	}
	return [value.substring(0, sep), value.substring(sep + 1)]
}
textSelector = parseSelector("textSelector", textSelector)
structureSelector = parseSelector("structureSelector", structureSelector)
if (textSelector == null || structureSelector == null) return;

// Echo the effective parameters
println "textIDColumn, joinColumn=$textIDColumn, $joinColumn"
println "textMetadataColumnList columns: $textMetadataColumnList"
println "textContentColumnList columns: $textContentColumnList"
println "structureMetadataColumnList columns: $structureMetadataColumnList"
println "structureContentColumnList columns: $structureContentColumnList"

println "text selector="+textSelector
println "structure selector="+structureSelector
println "structureTag="+structureTag
80

  
81
// Open the table and read its header line
TableReader reader = new TableReader(inputFile);
if (!reader.readHeaders()) {
	println "Error: no header"
	reader.close() // do not leave the input file open when aborting
	return
}
def headers = Arrays.asList(reader.getHeaders())
println "table columns: $headers"

// Every column named in the parameters must exist in the table.
// All missing columns are reported before aborting.
def ok = true
def requiredColumns = [
	"selection":[textIDColumn, joinColumn],
	"textMetadataColumnList":textMetadataColumnList,
	"textContentColumnList":textContentColumnList,
	"structureMetadataColumnList":structureMetadataColumnList,
	"structureContentColumnList":structureContentColumnList]
for (def entry : requiredColumns) {
	for (String m : entry.value) {
		m = m.trim()
		if (!headers.contains(m)) {
			println "Error: missing ${entry.key} column: $m"
			ok = false
		}
	}
}
if (!ok) {
	reader.close() // do not leave the input file open when aborting
	return;
}
102

  
103
// Group the records by text: texts maps a text id to its records, the text
// record first (index 0), followed by the structure records in reading order.
def texts = new LinkedHashMap()
def nRecord = 0
// Reads a cell of the current record; a missing (null) cell is read as "" so
// that trim() and matches() below cannot fail on it.
def cell = { column -> (reader.get(column) ?: "").trim() }
while (reader.readRecord()) {
	nRecord++

	String id = cell(textIDColumn)
	String join = cell(joinColumn)
	String textSelectorValue = cell(textSelector[0])
	String structureSelectorValue = cell(structureSelector[0])
	if (textSelectorValue.matches(textSelector[1])) {
		// text line: always put at the head of its own group
		if (!texts.containsKey(id)) texts[id] = []
		texts[id].add(0, reader.getRecord())
	} else if (structureSelectorValue.matches(structureSelector[1])) {
		// structure line: appended to the group of the text it points to
		if (!texts.containsKey(join)) texts[join] = []
		texts[join].add(reader.getRecord())
	} else {
		// neither a text nor a structure line: ignored
	}
}
println "N lines: "+nRecord
println "N groups: "+texts.size()
if (texts.size()  == 0) {
	println "No text found. Aborting."
	reader.close() // do not leave the input file open when aborting
	return
}
// mkdirs() also creates the missing parent directories
outputDirectory.mkdirs()
131

  
132
// ---- Write one XML file per text group ----

// Missing cells are written as empty strings so that no null value reaches the XML writer or replaceAll()
def cellValue = { rec, column -> rec.get(column) ?: "" }

// Writes one attribute per metadata column on the element that was just opened
def writeMetadataAttributes = { XMLStreamWriter w, rec, columns ->
	for (String att : columns) {
		if (att in metadataDateColumnList) {
			writeMetadataDate(rec, att, w)
		} else {
			w.writeAttribute(AsciiUtils.buildAttributeId(att), cellValue(rec, att))
		}
	}
}

// Writes the <metadata><list type="unordered"> block listing the metadata values, one <item> per column
def writeMetadataList = { XMLStreamWriter w, rec, columns ->
	w.writeStartElement("metadata")
	w.writeStartElement("list")
	w.writeAttribute("type", "unordered")
	w.writeCharacters("\n")
	for (String att : columns) {
		w.writeStartElement("item")
		w.writeCharacters(att+" : "+cellValue(rec, att).replaceAll("\n", ";"))
		w.writeEndElement() // item
		w.writeCharacters("\n")
	}
	w.writeEndElement() // list
	w.writeEndElement() // metadata
	w.writeCharacters("\n")
}

// Writes the content columns of a record. Prefix and list columns are delegated to
// writeContentPrefix()/writeContentList(); the other ones become a <p type="column">
// holding a <head><hi> title and the cell text. paragraphId, when not null, is
// written as the "id" attribute of the <p>.
def writeContent = { XMLStreamWriter w, rec, columns, String paragraphId ->
	for (String att : columns) {
		if (att in prefixContentColumnList) {
			writeContentPrefix(rec, att, w)
		} else if (att in listContentColumnList) {
			writeContentList(rec, att, w)
		} else {
			w.writeStartElement("p");
			w.writeAttribute("type", att.trim())
			if (paragraphId != null) w.writeAttribute("id", paragraphId)
			w.writeCharacters("\n") // simple XML formating

			w.writeStartElement("head")
			w.writeStartElement("hi")
			w.writeCharacters(att+" : ")
			w.writeEndElement() // hi
			w.writeEndElement() // head

			w.writeCharacters(cellValue(rec, att))
			w.writeEndElement() // p
			w.writeCharacters("\n") // simple XML formating
		}
	}
}

for (def id : texts.keySet()) {
	def toWrite = texts[id]
	def text = toWrite[0]
	String textSelectorValue = cellValue(text, textSelector[0]).trim()
	if (!textSelectorValue.matches(textSelector[1])) {
		// the first record of the group is not a text line
		println "ERROR: '$id' text group with  no text line"
		continue
	}
	println "Processing text: $id"

	File outputfile = new File(outputDirectory, id+".xml")
	FileOutputStream output = new FileOutputStream(outputfile)
	XMLStreamWriter writer = null
	try {
		writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8")

		writer.writeStartDocument("UTF-8","1.0")
		writer.writeCharacters("\n") // simple XML formating
		writer.writeStartElement("TEI")
		writer.writeCharacters("\n") // simple XML formating
		writer.writeStartElement("teiHeader")
		writer.writeEndElement() // teiHeader
		writer.writeCharacters("\n") // simple XML formating
		writer.writeStartElement("text")
		writer.writeAttribute("id", id)

		writeMetadataAttributes(writer, text, textMetadataColumnList)
		writer.writeCharacters("\n") // simple XML formating

		writeMetadataList(writer, text, textMetadataColumnList)
		writeContent(writer, text, textContentColumnList, null)

		// the records after the first one are the structures of the text
		int pb_n = 1;
		for (int i = 1 ; i < toWrite.size() ; i++) {
			def record = toWrite[i]

			writer.writeEmptyElement("pb") // <pb/>
			writer.writeAttribute("n", ""+pb_n++)

			writer.writeStartElement(structureTag)
			writeMetadataAttributes(writer, record, structureMetadataColumnList)
			writer.writeCharacters("\n")

			writeMetadataList(writer, record, structureMetadataColumnList)
			// the structure content comes from the structure record itself
			// (prefix and list columns used to be read from the text record)
			writeContent(writer, record, structureContentColumnList, cellValue(record, textIDColumn))

			writer.writeEndElement() // struct
			writer.writeCharacters("\n") // simple XML formating
		}

		writer.writeEndElement() // text
		writer.writeCharacters("\n") // simple XML formating
		writer.writeEndElement() // TEI
	} finally {
		// always release the output file, even when writing failed
		try {
			if (writer != null) writer.close()
		} finally {
			output.close()
		}
	}
}
// the table reader is closed once, after all the groups have been written
reader.close()
270

  
271
/**
 * Writes the content of a "list" column, one &lt;p rend="list"&gt; element per non-empty line.
 *
 * When the value contains no line break (or only line breaks) it is written
 * as a single &lt;p rend="no-list"&gt; element. A missing (null) value is written as
 * an empty string.
 *
 * @param record the record to read, accessed with record.get(att)
 * @param att the name of the column to write
 * @param writer the XMLStreamWriter to write to
 */
def writeContentList(def record, def att, def writer) {
	writer.writeCharacters("\n")
	String value = record.get(att) ?: ""
	// Splitting on line breaks keeps the last line even when it has no trailing
	// line break (it used to be silently dropped). Empty lines are skipped.
	def lines = value.split("\n").findAll { it.length() > 0 }
	if (value.contains("\n") && !lines.isEmpty()) {
		for (String line : lines) {
			writer.writeCharacters("\t")
			writer.writeStartElement("p")
			writer.writeAttribute("rend", "list")
			writer.writeCharacters(line)
			writer.writeEndElement() // p
			writer.writeCharacters("\n")
		}
	} else {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "no-list")
		writer.writeCharacters(value)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
	}
}
294

  
295
/**
 * Writes the content of a "prefix" column as a &lt;list rend="prefixes" type="unordered"&gt;.
 *
 * Each "XXX: value" item (three upper-case letters, optional colon) followed by
 * at least one space and a ';' or a line break becomes
 * &lt;item type="XXX"&gt;&lt;span&gt;XXX &lt;/span&gt;value&lt;/item&gt;. When no item is found the
 * raw value is written inside the list. A missing (null) value is read as "".
 *
 * NOTE(review): an item that is not followed by spaces and a separator (typically
 * the last one of the cell) is not matched by the pattern - confirm this is intended.
 *
 * @param record the record to read, accessed with record.get(att)
 * @param att the name of the column to write
 * @param writer the XMLStreamWriter to write to
 */
def writeContentPrefix(def record, def att, def writer) {
	writer.writeCharacters("\n")
	writer.writeStartElement("list")
	writer.writeAttribute("rend", "prefixes")
	writer.writeAttribute("type", "unordered")
	writer.writeCharacters("\n")

	String value = record.get(att) ?: ""
	// single pass over the items: group 1 is the prefix, group 2 the item text
	def items = (value =~ /(?s)([A-Z]{3}):? *([^;\n]+?) +[;\n]/)
	boolean found = false
	while (items.find()) {
		found = true
		writer.writeCharacters("\t")
		writer.writeStartElement("item")
		writer.writeAttribute("type", items.group(1))
		writer.writeStartElement("span")
		writer.writeCharacters(items.group(1)+" ")
		writer.writeEndElement() // span
		writer.writeCharacters(items.group(2))
		writer.writeEndElement() // item
		writer.writeCharacters("\n")
	}
	if (!found) {
		writer.writeCharacters(value)
	}
	writer.writeEndElement() // list
	writer.writeCharacters("\n")
}
322

  
323
/**
 * Writes a date metadata as five attributes of the current element, named after
 * the normalized column name: ...jour (day), ...joursemaine (day of week name),
 * ...mois (month), ...annee (year) and ...tri (yyyy-MM-dd sort key).
 *
 * The date is the first dd/MM/yyyy pattern found in the cell. When the cell is
 * missing or holds no such date, a warning is printed and the five attributes
 * are written with empty values (this used to fail with an exception).
 *
 * @param record the record to read, accessed with record.get(att)
 * @param att the name of the date column
 * @param writer the XMLStreamWriter to write to, positioned just after a start element
 */
def writeMetadataDate(def record, def att, def writer) {
	String value = record.get(att) ?: ""
	String att_normalized = AsciiUtils.buildAttributeId(att)

	String day = "", weekday = "", month = "", year = "", sortKey = ""
	def date = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
	if (date.find()) {
		day = date.group(1)
		month = date.group(2)
		year = date.group(3)
		sortKey = year+"-"+month+"-"+day
		// parse the matched date only, so that text around the date cannot break the parsing
		def parsed = new java.text.SimpleDateFormat("dd/MM/yyyy").parse(date.group(0))
		weekday = new java.text.SimpleDateFormat('EEEE').format(parsed)
	} else {
		println "Warning: no dd/MM/yyyy date found in the '$att' column: '$value'"
	}

	writer.writeAttribute(att_normalized+"jour", day)
	writer.writeAttribute(att_normalized+"joursemaine", weekday)
	writer.writeAttribute(att_normalized+"mois", month)
	writer.writeAttribute(att_normalized+"annee", year)
	writer.writeAttribute(att_normalized+"tri", sortKey)
}
333

  
334
/*
335
 String name = inputFile.getName()
336
 int idx = name.lastIndexOf(".")
337
 if (idx > 0) name = name.substring(0, idx)
338
 */
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/annotation/CQLList2WordPropertiesMacro.groovy (revision 3209)
29 29
@Field @Option(name="word_property", usage="The word property to annotate", widget="String", required=true, def="type")
30 30
		String word_property
31 31

  
32
@Field @Option(name="do_not_update_corpus_indexes_and_editions", usage="Select to only update the XML-TXM files", widget="Boolean", required=true, def="false")
33
		def do_not_update_corpus_indexes_and_editions
32
@Field @Option(name="update_corpus_indexes_and_editions", usage="Unselect to only update the XML-TXM files", widget="Boolean", required=true, def="true")
33
		def update_corpus_indexes_and_editions
34 34

  
35 35
// Open the parameters input dialog box
36 36
if (!ParametersDialog.open(this)) return
......
44 44
int n = gse.run(CQLList2WordAnnotationsMacro, hash)
45 45

  
46 46
if (n > 0) {
47
	println "Saving annotations..."
47
	println "Saving $n annotations..."
48 48
	//	monitor.syncExec(new Runnable() {
49 49
	//		public void run() {
50 50
	def saveJob = SaveAnnotations.save(mcorpus);
51 51
	if (saveJob == null || saveJob.getResult() == Status.CANCEL_STATUS) {
52 52
		// update editor corpus
53
		System.out.println("Fail to save annotations of the corpus."); //$NON-NLS-1$
53
		System.out.println("** Error: failed to save the annotations of the corpus."); //$NON-NLS-1$
54 54
		return;
55 55
	} else {
56
		if (!do_not_update_corpus_indexes_and_editions) {
56
		if (update_corpus_indexes_and_editions) {
57
			println "Updating corpus indexes and editions..."
57 58
			UpdateCorpus.update(mcorpus)
58 59
		}
59 60
		
......
64 65
} else {
65 66
	println "No annotation to save."
66 67
	return false
67
}
68
}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/annotation/WordProperty2WordMacro.groovy (revision 3209)
1
// Copyright © 2021 ENS Lyon
2
// Licensed under the terms of the GNU General Public License version 3 (http://www.gnu.org/licenses/gpl-3.0.html)
3
// @author mdecorde
4
// @author sheiden
1 5

  
2 6
package org.txm.macro.annotation
3 7

  
......
28 32
Project project = mcorpus.getProject()
29 33
File txmDir = new File(project.getProjectDirectory(), "txm/"+mcorpus.getName())
30 34

  
31
@Field @Option(name="outputDirectory", usage="Directory of resulting XML-TXM files", widget="Folder", required=true, def="result_directory")
35
@Field @Option(name="outputDirectory", usage="Directory of resulting XML-TXM files", widget="Folder", required=true, def="")
32 36
File outputDirectory
33 37

  
34
@Field @Option(name="word_property", usage="The word property to move", widget="String", required=true, def="type")
38
@Field @Option(name="word_property", usage="The word property to project", widget="String", required=true, def="type")
35 39
String word_property
36 40

  
37
//@Field @Option(name="word_property_value_to_ignore", usage="The annotation type", widget="String", required=true, def="__undef__")
38
//String word_property_value_to_ignore
41
@Field @Option(name="values_to_ignore_regex", usage="regex of values not to project", widget="String", required=true, def="")
42
String values_to_ignore_regex
39 43

  
44
@Field @Option(name="debug", usage="Afficher les messages de debug", widget="Boolean", required=true, def="false")
45
def debug
46

  
40 47
// Open the parameters input dialog box
41 48
if (!ParametersDialog.open(this)) return
42 49

  
......
45 52
	return false
46 53
}
47 54

  
55
values_to_ignore_regex = /$values_to_ignore_regex/
56

  
57
project.compute();
58

  
48 59
for (Text text : project.getChildren(Text.class)) {
49 60
	println "Text: "+text
50 61
	
51 62
	File orig = text.getXMLTXMFile()
52
	if (!orig.exists()) {
53
		println "Warning: no XML-TXM file found for "+text
54
		continue // next !
63
	if ((orig == null) || !orig.exists()) {
64
		println "** Error: no XML-TXM file found for "+text
65
		return // next !
55 66
	}
56 67
	
57 68
	File result = new File(outputDirectory, orig.getName())
......
62 73
	}
63 74
	
64 75
	XMLProcessor xp = new XMLProcessor(orig);
65
	LocalNameHookActivator activator = new LocalNameHookActivator("w");
76
	LocalNameHookActivator activator = new LocalNameHookActivator("w")
66 77
	new DOMIdentityHook("word", activator, xp) {
67 78
		@Override
68 79
		public void processDom() {
69 80
			//println dom
70 81
			use(groovy.xml.dom.DOMCategory) {
71 82
				def form = dom.form[0]
72
				def anatype = dom.ana.findAll(){ a -> a['@type'] == "#$word_property"}
83
				def anatype = dom.ana.findAll(){ a ->
84
					if (debug && (a['@type'] == "#$word_property")) {
85
						println 'a["@type"] = '+a["@type"]
86
						println 'a.text() = '+a.text()
87
						println 'a["@type"] == "#$word_property"'+" = "+(a['@type'] == "#$word_property")
88
						println "values_to_ignore_regex = "+values_to_ignore_regex
89
						println '!(a.text() ==~ values_to_ignore_regex))'+" = "+(!(a.text() ==~ values_to_ignore_regex))
90
						println '((a["@type"] == "#$word_property") && !(a.text() ==~ values_to_ignore_regex)) = '+((a['@type'] == "#$word_property") && !(a.text() ==~ values_to_ignore_regex))
91
					}
92
					return ((a['@type'] == "#$word_property") && !(a.text() ==~ values_to_ignore_regex))
93
				}
73 94
//				println form.text()+" -> $anatype"
74 95
				if (anatype.size() > 0) { // keep the word
75 96
					anatype = anatype[0]
......
78 99
					anatype['@type'] = "#form"
79 100
					anatype.setTextContent(s)
80 101
				} else {
81
					dom = null;
102
					dom = null
82 103
				}
83 104
			}
84 105
		}
85 106
	}
86 107
	xp.process(result)
87
}
108
}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/Excel2XMLMacro.groovy (revision 3209)
1
package org.txm.macro.table
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6

  
7
import org.txm.libs.msoffice.ReadExcel
8
import org.txm.rcpapplication.swt.widget.parameters.*
9
import org.txm.utils.*
10
import javax.xml.stream.*
11
import java.net.URL
12
import java.text.SimpleDateFormat
13

  
14
import org.apache.poi.ss.usermodel.*
15
import org.apache.poi.hssf.usermodel.*
16
import org.apache.poi.xssf.usermodel.*
17
import org.apache.poi.ss.usermodel.WorkbookFactory
18
import org.apache.poi.ss.util.*
19

  
20
// Normalizes a column name into the identifier used below for attribute and element names:
// AsciiUtils.convertNonAscii, then AsciiUtils.buildAttributeId, then lower-casing.
def stringToIndent = { str -> org.txm.utils.AsciiUtils.buildAttributeId(org.txm.utils.AsciiUtils.convertNonAscii(str)).toLowerCase() }
21

  
22
// ---- Macro parameters, edited through the ParametersDialog ----

@Field @Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="")
		File inputFile

@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
		def sheetName

// Name of the element wrapping all the lines
@Field @Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root")
		def rootTag

// Name of the element written for each line of the sheet
@Field @Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit")
		def textTag

@Field @Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2")
		def metadataColumnList

// Metadata columns holding dd/MM/yyyy dates, also written as day/month/year attributes
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
		def dateColumnList

@Field @Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2")
		def textColumnList

// When selected the result is wrapped in <TEI><teiHeader/><text>...</text></TEI>
// (the usage text previously repeated the one of textColumnList)
@Field @Option(name="EmbedInTEI", usage="embed the XML content in a TEI text element", widget="Boolean", required=false, def="false")
		def EmbedInTEI
45

  
46
// Open the parameters dialog; stop when it is not validated
if (!ParametersDialog.open(this)) return

if (!inputFile.exists()) {
	println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting."
	return false
}

if (!inputFile.canRead()) {
	println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting."
	return false
}

// Declared outside of the try block so that the finally block can release them
def output = null
def writer = null
try {
	
	metadataColumnList = metadataColumnList.split(",").collect { it.trim() }
	dateColumnList = dateColumnList.split(",").collect { it.trim() }
	textColumnList = textColumnList.split(",").collect { it.trim() }
	textTag = textTag.trim()
	rootTag = rootTag.trim()
	
	ReadExcel excel = new ReadExcel(inputFile, sheetName);
	excel.readHeaders();
	
	def headers = excel.getHeaders()
	
	println "Headers: $headers"
	
	// Every metadata and text column must exist in the sheet; all the missing
	// columns are reported before aborting.
	boolean ok = true
	for (def m : metadataColumnList) {
		if (!headers.contains(m)) {
			println "** Excel2XML: missing metadataColumnList column: $m"
			ok = false
		}
	}
	for (def t : textColumnList) {
		if (!headers.contains(t)) {
			println "** Excel2XML: missing textColumnList column: $t"
			ok = false
		}
	}
	
	if (!ok) { return false }
	
	println "metadataColumnList = "+metadataColumnList
	println "dateColumnList = "+dateColumnList
	println "textColumnList = "+textColumnList
	
	// The result is written next to the input file, with the .xml extension
	String name = inputFile.getName()
	int idx = name.lastIndexOf(".")
	
	if (idx > 0) name = name.substring(0, idx)
	def outputFile = new File(inputFile.getParentFile(), name+".xml")
	
	output = new FileOutputStream(outputFile)
	writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8")
	
	writer.writeStartDocument("UTF-8","1.0")
	writer.writeCharacters("\n") // simple XML formating
	
	if (EmbedInTEI) {
		writer.writeStartElement("TEI")
		writer.writeStartElement("teiHeader")
		writer.writeEndElement() // teiHeader
		writer.writeStartElement("text")
		writer.writeCharacters("\n")
	}
	
	writer.writeStartElement(rootTag)
	writer.writeCharacters("\n")
	
	int pb_n = 1
	
	def record = null
	while (excel.readRecord()) {
		
		record = excel.getRecord()
		if (record.isEmpty()) continue;
		
		writer.writeCharacters("  ")
		writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
		writer.writeAttribute("n", ""+pb_n++)
		writer.writeCharacters("\n") // simple XML formating
		
		writer.writeCharacters("  ")
		writer.writeStartElement(textTag)
		// build an attribute for each metadata
		for (def metadataName : metadataColumnList) {
			String metadataValue = (record.get(metadataName) ?: "").replaceAll("\n", ";").trim()
			String attributeName = stringToIndent(metadataName)
			writer.writeAttribute(attributeName, metadataValue)
			
			if (metadataName in dateColumnList) { // also split date attributes in day+month+year attributes
				// The derived attributes are named after the normalized column name, like the
				// main attribute: the raw column name may not be a valid XML attribute name.
				// When the value holds no dd/MM/yyyy date they are written empty instead of failing.
				String day = "", weekday = "", month = "", year = ""
				def date = (metadataValue =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
				if (date.find()) {
					day = date.group(1)
					month = date.group(2)
					year = date.group(3)
					// parse the matched date only, so that text around the date cannot break the parsing
					def parsed = new java.text.SimpleDateFormat("dd/MM/yyyy").parse(date.group(0))
					weekday = new java.text.SimpleDateFormat('EEEE').format(parsed)
				} else {
					println "** Excel2XML: no dd/MM/yyyy date found in the '$metadataName' column: '$metadataValue'"
				}
				writer.writeAttribute(attributeName+"-jour", day)
				writer.writeAttribute(attributeName+"-joursemaine", weekday)
				writer.writeAttribute(attributeName+"-mois", month)
				writer.writeAttribute(attributeName+"-annee", year)
			}
		}
		writer.writeCharacters("\n")
		
		// readable list of the metadata values
		writer.writeCharacters("    ")
		writer.writeStartElement("metadata")
		writer.writeStartElement("list")
		writer.writeAttribute("type", "unordered")
		writer.writeCharacters("\n")
		
		for (def metadataName : metadataColumnList) {
			writer.writeStartElement("item")
			// a missing cell is written as an empty string, like for the attributes above
			writer.writeCharacters(metadataName+" : "+(record.get(metadataName) ?: "").replaceAll("\n", ";"))
			writer.writeEndElement() // item
			writer.writeCharacters("\n")
		}
		writer.writeCharacters("    ")
		writer.writeEndElement() // list
		writer.writeEndElement() // metadata
		writer.writeCharacters("\n")
		
		// one element per text column, named after the normalized column name
		for (def textColumnName : textColumnList) {
			
			writer.writeCharacters("    ")
			writer.writeStartElement(stringToIndent(textColumnName))
			writer.writeStartElement("p")
			writer.writeStartElement("head")
			writer.writeStartElement("hi")
			writer.writeCharacters(textColumnName+" : ")
			writer.writeEndElement() // hi
			writer.writeEndElement() // head
			
			String textValue = record.get(textColumnName) ?: ""
				
			// a value starting with a "XXX:  value ;" item is written as a sequence of <descripteur type="XXX"> elements
			if (textValue ==~ /(?s)^[A-Z]{3}:  [^;\n]+? +[;\n].*/) {
				def descriptors = (textValue =~ /(?s)([A-Z]{3}):  ([^;\n]+?) +[;\n]/)
				while (descriptors.find()) {
					writer.writeStartElement("descripteur")
					writer.writeAttribute("type", descriptors.group(1))
					writer.writeCharacters(descriptors.group(2))
					writer.writeEndElement() // descripteur
				}
			} else {
				writer.writeCharacters(textValue)
			}
			writer.writeEndElement() // p
			writer.writeEndElement() // textColumn
			writer.writeCharacters("\n")
		}
		
		writer.writeCharacters("  ")
		writer.writeEndElement() // textTag
		writer.writeCharacters("\n")
	}
	
	writer.writeEndElement() // rootTag
	writer.writeCharacters("\n")
	
	if (EmbedInTEI) {
		writer.writeEndElement() // text
		writer.writeCharacters("\n")
		writer.writeEndElement() // TEI
		writer.writeCharacters("\n")
	}
	
	println "Result file: $outputFile"
	
} catch (Exception e) {
	println "** Excel2XML: unable to read input file. Aborting."
	println e.getLocalizedMessage()
	e.printStackTrace() // printStackTrace() returns nothing: printing its result only added a "null" line
	return false
} finally {
	// always release the output file, even when the conversion failed
	try {
		if (writer != null) writer.close()
	} finally {
		if (output != null) output.close()
	}
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/Excel2XMLMacro2.groovy (revision 3209)
1
package org.txm.macro.table
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6

  
7
import org.txm.libs.msoffice.ReadExcel
8
import org.txm.rcpapplication.swt.widget.parameters.*
9
import org.txm.utils.*
10
import javax.xml.stream.*
11
import java.net.URL
12
import java.text.SimpleDateFormat
13

  
14
import org.apache.poi.ss.usermodel.*
15
import org.apache.poi.hssf.usermodel.*
16
import org.apache.poi.xssf.usermodel.*
17
import org.apache.poi.ss.usermodel.WorkbookFactory
18
import org.apache.poi.ss.util.*
19

  
20
def stringToIndent = { str -> org.txm.utils.AsciiUtils.buildAttributeId(org.txm.utils.AsciiUtils.convertNonAscii(str)).toLowerCase() }
21

  
22
// ---- Macro parameters ----
// Each @Field/@Option pair declares one script parameter; they are read after the
// ParametersDialog.open(this) call of the script body.

// Excel file to convert (existence and readability are checked by the script body).
@Field @Option(name="inputFile", usage="fichier Excel à traiter", widget="File", required=true, def="")
File inputFile

// Name of the sheet handed to ReadExcel.
@Field @Option(name="sheetName", usage="sheet name (if no name is given the first sheet will be used)", widget="String", required=false, def="")
def sheetName

// When set, the output is wrapped in <TEI><teiHeader/><text>...</text></TEI>.
@Field @Option(name="EmbedInTEI", usage="embed xml content in  TEI text element", widget="Boolean", required=false, def="false")
def EmbedInTEI

// Name of the element wrapping all records; no wrapper is written when empty.
@Field @Option(name="rootTag", usage="root tag name", widget="String", required=false, def="root")
def rootTag

// Name of the element written for each record; no element is written when empty.
@Field @Option(name="textTag", usage="line unit tag name", widget="String", required=false, def="unit")
def textTag

// Comma separated column titles written as attributes of the record element and as <item>s.
@Field @Option(name="metadataColumnList", usage="metadata columns list separated by comma", widget="String", required=false, def="meta1,meta2")
def metadataColumnList

// Comma separated metadata column titles whose dd/MM/yyyy value is also split into date-part attributes.
@Field @Option(name="dateColumnList", usage="date columns list separated by comma", widget="String", required=false, def="meta1,meta2")
def dateColumnList

// Comma separated column titles written as textual content of the record.
@Field @Option(name="textColumnList", usage="text columns list separated by comma", widget="String", required=false, def="textColumnList1,textColumnList2")
def textColumnList

// NOTE(review): split into a list by the script body but not used afterwards in the visible code.
@Field @Option(name="prefixContentColumnList", usage="columns which content is a semi-colon separated prefixes list", widget="String", required=false, def="")
def prefixContentColumnList

// NOTE(review): split into a list by the script body but not used afterwards in the visible code.
@Field @Option(name="listContentColumnList", usage="columns which content is a semi-colon separated list", widget="String", required=false, def="")
def listContentColumnList
51

  
52

  
53
// Excel2XML main body.
// Opens the parameters dialog, checks the input file, validates the requested columns
// against the sheet headers, then writes every non-empty record as XML into
// "<input file name without extension>.xml" in the directory of the input file.
// Returns false when the conversion is aborted and true when the file was written.
if (!ParametersDialog.open(this)) return

if (!inputFile.exists()) {
	println "** Excel2XML: no '"+inputFile.name+"' file found. Aborting."
	return false
}

if (!inputFile.canRead()) {
	println "** Excel2XML: '"+inputFile.name+"' file not readable. Aborting."
	return false
}

// declared before the try block so that the finally block can release them
def output = null
def writer = null

try {
	// "a, b,,c" -> [a, b, c]; a null or empty parameter gives an empty list instead of [""],
	// which was previously reported as a missing column with an empty name
	def splitList = { raw ->
		(raw == null ? "" : raw.toString()).split(",").collect { it.trim() }.findAll { it.length() > 0 }
	}
	metadataColumnList = splitList(metadataColumnList)
	dateColumnList = splitList(dateColumnList)
	textColumnList = splitList(textColumnList)
	prefixContentColumnList = splitList(prefixContentColumnList)
	listContentColumnList = splitList(listContentColumnList)
	textTag = textTag?.trim()
	rootTag = rootTag?.trim()

	ReadExcel excel = new ReadExcel(inputFile, sheetName);
	excel.readHeaders();

	def headers = excel.getHeaders()

	println "Headers: $headers"

	// every metadata and text column must exist in the sheet
	def ok = true
	metadataColumnList.each { m ->
		if (!headers.contains(m)) {
			println "** Excel2XML: missing metadataColumnList column: $m"
			ok = false
		}
	}
	textColumnList.each { t ->
		if (!headers.contains(t)) {
			println "** Excel2XML: missing textColumnList column: $t"
			ok = false
		}
	}

	if (!ok) { return false }

	println "metadataColumnList = "+metadataColumnList
	println "dateColumnList = "+dateColumnList
	println "textColumnList = "+textColumnList

	// output file = input file with its extension replaced by ".xml"
	def baseName = inputFile.getName()
	int dot = baseName.lastIndexOf(".")
	if (dot > 0) baseName = baseName.substring(0, dot)
	def outputFile = new File(inputFile.getParentFile(), baseName+".xml")

	def factory = XMLOutputFactory.newInstance()
	output = new FileOutputStream(outputFile)
	writer = factory.createXMLStreamWriter(output, "UTF-8")

	writer.writeStartDocument("UTF-8","1.0")
	writer.writeCharacters("\n") // simple XML formating

	if (EmbedInTEI) {
		writer.writeStartElement("TEI")
		writer.writeStartElement("teiHeader")
		writer.writeEndElement() // teiHeader
		writer.writeStartElement("text")
		writer.writeCharacters("\n")
	}

	if (rootTag != null && rootTag.length() > 0) {
		writer.writeStartElement(rootTag)
		writer.writeCharacters("\n")
	}

	def datePattern = /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/
	def descriptorPattern = /(?s)([A-Z]{3}):  ([^;\n]+?) +[;\n]/
	def pb_n = 1

	def record = null
	while (excel.readRecord()) {

		record = excel.getRecord()
		if (record.isEmpty()) continue;

		writer.writeCharacters("  ")
		writer.writeEmptyElement("pb") // <pb/> to get one page per input line (don't forget high number of words per page in import module)
		writer.writeAttribute("n", ""+pb_n++)
		writer.writeCharacters("\n") // simple XML formating

		writer.writeCharacters("  ")
		if (textTag != null && textTag.length() > 0) {
			writer.writeStartElement(textTag)

			// one attribute per metadata column
			for (def metadataName : metadataColumnList) {
				String s = record.get(metadataName);
				if (s == null) s = "";

				String value = s.replaceAll("\n", ";").trim()
				String attributeName = stringToIndent(metadataName)
				writer.writeAttribute(attributeName, value)

				if (metadataName in dateColumnList) { // also split date attributes in day+month+year attributes
					// The date-part attributes use the normalized name: the raw column title
					// (spaces, parentheses...) does not give a well-formed attribute name.
					def date = (value =~ datePattern)
					if (date.find()) {
						// the weekday is computed from the matched date only, not from the whole cell
						def parsed = new java.text.SimpleDateFormat("dd/MM/yyyy").parse(date.group(0))
						writer.writeAttribute(attributeName+"-jour", date.group(1))
						writer.writeAttribute(attributeName+"-joursemaine", new java.text.SimpleDateFormat('EEEE').format(parsed))
						writer.writeAttribute(attributeName+"-mois", date.group(2))
						writer.writeAttribute(attributeName+"-annee", date.group(3))
					} else {
						// a cell without a dd/MM/yyyy date no longer aborts the whole conversion
						println "** Excel2XML: '"+metadataName+"' value '"+value+"' is not a dd/MM/yyyy date, writing empty date attributes."
						writer.writeAttribute(attributeName+"-jour", "")
						writer.writeAttribute(attributeName+"-joursemaine", "")
						writer.writeAttribute(attributeName+"-mois", "")
						writer.writeAttribute(attributeName+"-annee", "")
					}
				}
			}
			writer.writeCharacters("\n")
		}

		// human readable copy of the metadata
		writer.writeCharacters("    ")
		writer.writeStartElement("metadata")
		writer.writeStartElement("list")
		writer.writeAttribute("type", "unordered")
		writer.writeCharacters("\n")

		for (def metadataName : metadataColumnList) {
			String s = record.get(metadataName)
			if (s == null) s = "" // empty cell
			writer.writeStartElement("item")
			writer.writeCharacters(metadataName+" : "+s.replaceAll("\n", ";"))
			writer.writeEndElement() // item
			writer.writeCharacters("\n")
		}
		writer.writeCharacters("    ")
		writer.writeEndElement() // list
		writer.writeEndElement() // metadata
		writer.writeCharacters("\n")

		// one element per text column, named after the normalized column title
		for (def textColumnName : textColumnList) {

			writer.writeCharacters("    ")
			writer.writeStartElement(stringToIndent(textColumnName))
			writer.writeStartElement("p")
			writer.writeStartElement("head")
			writer.writeStartElement("hi")
			writer.writeCharacters(textColumnName+" : ")
			writer.writeEndElement() // hi
			writer.writeEndElement() // head

			String value = record.get(textColumnName)
			if (value == null) value = "" // empty cell

			if (value ==~ /(?s)^[A-Z]{3}:  [^;\n]+? +[;\n].*/) {
				// "XXX:  content ;" items are written as typed <descripteur> elements
				def descriptor = (value =~ descriptorPattern)
				while (descriptor.find()) {
					writer.writeStartElement("descripteur")
					writer.writeAttribute("type", descriptor.group(1))
					writer.writeCharacters(descriptor.group(2))
					writer.writeEndElement() // descripteur
				}
			} else {
				writer.writeCharacters(value)
			}
			writer.writeEndElement() // p
			writer.writeEndElement() // textColumn
			writer.writeCharacters("\n")
		}

		writer.writeCharacters("  ")

		if (textTag != null && textTag.length() > 0) {
			writer.writeEndElement() // textTag
			writer.writeCharacters("\n")
		}
	}

	if (rootTag != null && rootTag.length() > 0) {
		writer.writeEndElement() // rootTag
		writer.writeCharacters("\n")
	}

	if (EmbedInTEI) {
		writer.writeEndElement() // text
		writer.writeCharacters("\n")
		writer.writeEndElement() // TEI
		writer.writeCharacters("\n")
	}

	// regular close: errors while flushing are reported by the catch block below
	writer.close()
	writer = null
	output.close()
	output = null
	println "Result file: $outputFile"

} catch (Exception e) {
	println "** Excel2XML: unable to read input file. Aborting."
	println e.getLocalizedMessage()
	e.printStackTrace()
	return false
} finally {
	// only reached with non-null values when the conversion aborted: release the file handle
	try { if (writer != null) writer.close() } catch (Exception ignored) { }
	try { if (output != null) output.close() } catch (Exception ignored) { }
}

return true
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/table/Table2TextCorpusMacro.groovy (revision 3209)
1
package org.txm.macro.csv
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcp.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10
import org.txm.libs.msoffice.ReadExcel
11

  
12
// ---- Macro parameters ----
// Each @Field/@Option pair declares one script parameter; they are read after the
// ParametersDialog.open(this) call of the script body.

// Table file to read (opened with TableReader, then ReadExcel as a fallback).
@Field @Option(name="inputFile", usage="CSV File", widget="File", required=false, def="file.xlsx")
File inputFile

// Directory receiving one "<text id>.xml" file per text.
@Field @Option(name="outputDirectory", usage="output directory", widget="Folder", required=false, def="directory")
File outputDirectory

// Dialog separator introducing the text parameters.
@Field @Option(name="textsSeparator", usage="Texts", widget="Separator", required=false, def="Texts")
def textsSeparator

// Column holding the identifier of a record; used as grouping key and as output file name.
@Field @Option(name="textIDColumn", usage="<text> id column", widget="String", required=false, def="Identifiant de la notice")
def textIDColumn

// "column=value" pair; the value is used as a regular expression to recognise text records.
@Field @Option(name="textSelector", usage="<text> lines selector format is column=value", widget="String", required=false, def="Type de notice=Notice sommaire")
def textSelector

// Comma separated columns written as attributes and metadata items of the <text> element.
@Field @Option(name="textMetadataColumnList", usage="<text> metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
def textMetadataColumnList

// Comma separated columns written as textual content of the <text> element.
@Field @Option(name="textContentColumnList", usage="<text> textual content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
def textContentColumnList

// Dialog separator introducing the structure parameters.
@Field @Option(name="structuresSeparator", usage="Structures", widget="Separator", required=false, def="Structures")
def structuresSeparator

// Name of the element written for each structure record.
@Field @Option(name="structureTag", usage="structure element to create", widget="String", required=false, def="div")
def structureTag

// "column=regexp" pair recognising structure records.
@Field @Option(name="structureSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sujet")
def structureSelector

// Column of a structure record holding the identifier of the text it belongs to.
@Field @Option(name="textJoinColumn", usage="jointure column, values should point to the textIDColumn values", widget="String", required=false, def="Lien notice principale")
def textJoinColumn

// Comma separated columns written as attributes and metadata items of each structure element.
@Field @Option(name="structureMetadataColumnList", usage="structure metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
def structureMetadataColumnList

// Comma separated columns written as textual content of each structure element.
@Field @Option(name="structureContentColumnList", usage="structure textual content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
def structureContentColumnList

// Dialog separator introducing the column type parameters.
@Field @Option(name="typesSeparator", usage="Columns types", widget="Separator", required=false, def="Columns types")
def typesSeparator

// Metadata columns written with writeMetadataDate instead of as a plain attribute.
@Field @Option(name="dateColumnTypeList", usage="metadata columns of type=Date", widget="String", required=false, def="")
def dateColumnTypeList

// Content columns written with writePrefixTextContent.
@Field @Option(name="prefixesColumnTypeList", usage="metadata columns of type=Prefixes", widget="String", required=false, def="")
def prefixesColumnTypeList

// Content columns written with writeListTextContent.
@Field @Option(name="listColumnTypeList", usage="metadata columns of type=List semi-colon separated", widget="String", required=false, def="")
def listColumnTypeList
62

  
63
		// When set, the script prints a "Processing text: <id>" line for each text it writes.
		@Field @Option(name="debug", usage="Show debug messages", widget="Boolean", required=false, def="false")
		def debug
65
//@Field @Option(name="structureOrderColumn", usage="structure column coding structure order", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
66
//		def structureOrderColumn;
67

  
68
// Opens the parameters dialog, then normalizes the raw parameter strings:
// the column lists become lists of trimmed names and the two selectors become
// [column, regexp] pairs ([null, null] when the selector has no "=").
if (!ParametersDialog.open(this)) return;

// "a, b,,c" -> [a, b, c]; a null or empty parameter gives an empty list instead of
// failing on null or producing [""] (a column with an empty name)
def splitColumns = { raw ->
	(raw == null ? "" : raw.toString()).split(",").collect { it.trim() }.findAll { it.length() > 0 }
}

// "column=regexp" -> [column, regexp], split on the first "=" only
def parseSelector = { raw ->
	if (raw == null || !raw.contains("=")) return [null, null]
	String spec = raw.trim()
	int equalsIndex = spec.indexOf("=")
	return [spec.substring(0, equalsIndex).trim(), spec.substring(equalsIndex+1).trim()]
}

textMetadataColumnList = splitColumns(textMetadataColumnList)
textContentColumnList = splitColumns(textContentColumnList)
structureMetadataColumnList = splitColumns(structureMetadataColumnList)
structureContentColumnList = splitColumns(structureContentColumnList)
dateColumnTypeList = splitColumns(dateColumnTypeList)
prefixesColumnTypeList = splitColumns(prefixesColumnTypeList)
listColumnTypeList = splitColumns(listColumnTypeList)
textSelector = parseSelector(textSelector)
structureSelector = parseSelector(structureSelector)

// echo the effective parameters
println "textIDColumn, textJoinColumn=$textIDColumn, $textJoinColumn"
println "textMetadataColumnList columns: $textMetadataColumnList"
println "textContentColumnList columns: $textContentColumnList"
println "structureMetadataColumnList columns: $structureMetadataColumnList"
println "structureContentColumnList columns: $structureContentColumnList"

println "text selector="+textSelector
println "structure selector="+structureSelector
println "structureTag="+structureTag
99

  
100
// Opens the input table (TableReader first, ReadExcel when TableReader cannot be created),
// reads the header line and checks that every requested column exists.
// The script stops, after closing the reader, when the header is missing or a column is unknown.
def reader = null
try {
	reader = new TableReader(inputFile);
	println "Reading $inputFile with TableReader..."
} catch(Exception e) {
	// deliberate fallback: any TableReader failure leads to a second attempt with ReadExcel
	reader = new ReadExcel(inputFile, null);
	println "Reading $inputFile with ReadExcel..."
}

if (!reader.readHeaders()) {
	println "** Error: no header"
	reader.close()
	return
}
def headers = Arrays.asList(reader.getHeaders())
println "$inputFile table column names: $headers"

// columns to find in the header, grouped by the parameter that requested them
def ok = true
def hash = ["selection":[textIDColumn, textJoinColumn], "textMetadataColumnList":textMetadataColumnList,
	"textContentColumnList":textContentColumnList, "structureMetadataColumnList":structureMetadataColumnList,
	"structureContentColumnList":structureContentColumnList]
// the selector columns are read for every record: check them too when a selector is set
if (textSelector[0] != null) hash["textSelector"] = [textSelector[0]]
if (structureSelector[0] != null) hash["structureSelector"] = [structureSelector[0]]

hash.each { key, columns ->
	columns.each { m ->
		if (!headers.contains(m)) {
			println "** Error: missing $key column: $m"
			ok = false
		}
	}
}

if (!ok) {
	reader.close()
	return;
}
130

  
131
// group records by text
132
// group records by text
// texts maps a text identifier to its records: the text record is inserted at index 0,
// the structure records are appended in reading order.
def texts = new LinkedHashMap()
def nRecord = 0
def nRecordToWrite = 0

// trimmed cell of the current record; null when the column is not configured or the cell is
// missing (the previous code called trim() on null and failed when no selector was set)
def cell = { column ->
	if (column == null || column.length() == 0) return null
	def raw = reader.get(column)
	return raw == null ? null : raw.trim()
}
// stores the current record as head (text) or tail (structure) of a group
def addAsText = { key ->
	if (!texts.containsKey(key)) texts[key] = []
	texts[key].add(0, reader.getRecord())
}
def addAsStructure = { key ->
	if (!texts.containsKey(key)) texts[key] = []
	texts[key].add(reader.getRecord())
	nRecordToWrite++
}

boolean groupByText = textIDColumn != null && textJoinColumn != null && textIDColumn.length() > 0 && textJoinColumn.length() > 0

while (reader.readRecord()) {
	nRecord++

	if (!groupByText) { // no id & join parameter set -> regroup all record in one file
		addAsStructure("result.xml")
		continue
	}

	String id = cell(textIDColumn) ?: ""
	String join = cell(textJoinColumn) ?: ""
	String textSelectorValue = cell(textSelector[0])
	String structureSelectorValue = cell(structureSelector[0])

	if (textSelectorValue != null && structureSelectorValue != null) {
		if (textSelectorValue.matches(textSelector[1])) {
			addAsText(id)
		} else if (structureSelectorValue.matches(structureSelector[1])) {
			addAsStructure(join)
		}
		// other records are ignored
	} else { // no text&structure selector set, take the record if 'id' or 'join' are set
		if (id.length() > 0) {
			addAsText(id)
		} else if (join.length() > 0) {
			addAsStructure(join)
		}
		// other records are ignored
	}
}
175

  
176
// Prints the grouping statistics and prepares the output directory;
// the script stops when there is nothing to write or the directory cannot be created.
println "N records: "+nRecord
println "N records to write: "+nRecordToWrite
println "N texts to build: "+texts.size()
if (texts.size()  == 0) {
	println "No text found. Aborting."
	return
}
// mkdirs() also creates the missing parent directories, where mkdir() silently fails
outputDirectory.mkdirs()
if (!outputDirectory.isDirectory()) {
	println "** Error: could not create output directory: $outputDirectory"
	return
}
184

  
185
// Writes one "<id>.xml" TEI file per group of records.
// The first record of a group is the text: its metadata columns become attributes of <text>
// and its content columns become paragraphs. Each following record is written as a
// <pb/> followed by a structureTag element built the same way from the structure columns.
// Groups whose first record does not match the text selector are reported and skipped.

// value of a cell, empty string when the cell is missing
def cellValue = { rec, column ->
	def v = rec.get(column)
	return v == null ? "" : v
}

// one attribute per metadata column; date columns are expanded by writeMetadataDate
def writeAttributes = { w, rec, columns ->
	for (String att : columns) {
		if (att in dateColumnTypeList) {
			writeMetadataDate(rec, att, w)
		} else {
			w.writeAttribute(AsciiUtils.buildAttributeId(att), cellValue(rec, att)) // struct
		}
	}
}

// <metadata><list> block repeating each metadata column as "<column> : <value>"
def writeMetadataList = { w, rec, columns ->
	w.writeStartElement("metadata")
	w.writeStartElement("list")
	w.writeAttribute("type", "unordered")
	w.writeCharacters("\n")
	for (String att : columns) {
		w.writeStartElement("item")
		w.writeCharacters(att+" : "+cellValue(rec, att).replaceAll("\n", ";"))
		w.writeEndElement() // item
		w.writeCharacters("\n")
	}
	w.writeEndElement() // list
	w.writeEndElement() // metadata
	w.writeCharacters("\n")
}

// content columns of one record; paragraphId, when not null, is written as the "id" of each <p>
def writeContent = { w, rec, columns, paragraphId ->
	for (String att : columns) {
		if (att in prefixesColumnTypeList) {
			writePrefixTextContent(rec, att, w)
		} else if (att in listColumnTypeList) {
			writeListTextContent(rec, att, w)
		} else {
			w.writeStartElement("p");
			w.writeAttribute("type", att.trim())
			if (paragraphId != null) w.writeAttribute("id", paragraphId)
			w.writeCharacters("\n") // simple XML formating

			w.writeStartElement("head")
			w.writeStartElement("hi")
			w.writeCharacters(att+" : ")
			w.writeEndElement() // hi
			w.writeEndElement() // head

			w.writeCharacters(cellValue(rec, att)) // column content
			w.writeEndElement() // p
			w.writeCharacters("\n") // simple XML formating
		}
	}
}

try {
	for (def id : texts.keySet()) {
		def toWrite = texts[id]
		def text = toWrite[0]

		// without a text selector every group is written; otherwise its first record must match
		def selectorColumn = textSelector[0]
		boolean isTextLine = true
		if (selectorColumn != null) {
			def selectorValue = text.get(selectorColumn)
			isTextLine = selectorValue != null && selectorValue.trim().matches(textSelector[1])
		}
		if (!isTextLine) {
			// error
			println "ERROR: '$id' text group with  no text line"
			continue
		}

		if (debug) println "Processing text: $id"

		File outputfile = new File(outputDirectory, id+".xml")
		XMLOutputFactory factory = XMLOutputFactory.newInstance()
		FileOutputStream output = new FileOutputStream(outputfile)
		try {
			XMLStreamWriter writer = factory.createXMLStreamWriter(output, "UTF-8")

			writer.writeStartDocument("UTF-8","1.0")
			writer.writeCharacters("\n") // simple XML formating
			writer.writeStartElement("TEI")
			writer.writeCharacters("\n") // simple XML formating
			writer.writeStartElement("teiHeader")
			writer.writeEndElement() // teiHeader
			writer.writeCharacters("\n") // simple XML formating
			writer.writeStartElement("text")
			writer.writeAttribute("id", id)
			writeAttributes(writer, text, textMetadataColumnList)
			writer.writeCharacters("\n") // simple XML formating

			writeMetadataList(writer, text, textMetadataColumnList)
			writeContent(writer, text, textContentColumnList, null)

			int pb_n = 1;
			for (int i = 1 ; i < toWrite.size() ; i++) {
				def record = toWrite[i]

				writer.writeEmptyElement("pb") // <pb/>
				writer.writeAttribute("n", ""+pb_n++)
				writer.writeAttribute("type", "record")

				writer.writeStartElement(structureTag)
				writeAttributes(writer, record, structureMetadataColumnList)
				writer.writeCharacters("\n")

				writeMetadataList(writer, record, structureMetadataColumnList)
				// prefix and list columns are now read from the structure record itself,
				// they were previously read from the text record
				writeContent(writer, record, structureContentColumnList, cellValue(record, textIDColumn))

				writer.writeEndElement() // struct
				writer.writeCharacters("\n") // simple XML formating
			}

			writer.writeEndElement() // text
			writer.writeCharacters("\n") // simple XML formating
			writer.writeEndElement() // TEI
			writer.close()
		} finally {
			// the file handle is released even when writing this text failed
			output.close()
		}
	}
} finally {
	// closed once, after all texts, instead of after each text
	reader.close()
}
328

  
329
/**
 * Writes the content of a list column as a sequence of paragraphs.
 *
 * When the cell contains at least one line break, each non-empty line is written as
 * <p rend="list">line</p>, including the last line when it has no trailing line break.
 * Otherwise the whole cell is written as a single <p rend="no-list">.
 * NOTE(review): the option usage text mentions a semi-colon separated list but only
 * line breaks are used as separators here - confirm the expected cell format.
 *
 * @param record the record to read, accessed with record.get(att); a missing cell is treated as empty
 * @param att the name of the column to write
 * @param writer the XMLStreamWriter receiving the elements
 */
def writeListTextContent(def record, def att, def writer) {
	writer.writeCharacters("\n")

	String value = record.get(att)
	if (value == null) value = ""

	// splitting keeps the last line, which the previous "line followed by \n" pattern dropped
	def lines = value.contains("\n") ? value.split("\n").findAll { it.length() > 0 } : []

	if (lines.isEmpty()) {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "no-list")
		writer.writeCharacters(value)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
		return
	}

	for (String line : lines) {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "list")
		writer.writeCharacters(line)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
	}
}
352

  
353
/**
 * Writes the content of a prefixes column as <list rend="prefixes" type="unordered">.
 *
 * Each "XXX: content ;" item (three upper-case letters, optional colon, content, at least
 * one space, then ";" or a line break) becomes
 * <item type="XXX"><span>XXX </span>content</item>.
 * When the cell contains no such item, its raw text is written inside the list element.
 *
 * @param record the record to read, accessed with record.get(att); a missing cell is treated as empty
 * @param att the name of the column to write
 * @param writer the XMLStreamWriter receiving the elements
 */
def writePrefixTextContent(def record, def att, def writer) {
	writer.writeCharacters("\n")
	writer.writeStartElement("list")
	writer.writeAttribute("rend", "prefixes")
	writer.writeAttribute("type", "unordered")
	writer.writeCharacters("\n")

	String value = record.get(att)
	if (value == null) value = ""

	// a single pass over the cell: group 1 is the prefix, group 2 the content
	def item = (value =~ /(?s)([A-Z]{3}):? *([^;\n]+?) +[;\n]/)
	boolean found = false
	while (item.find()) {
		found = true
		writer.writeCharacters("\t")
		writer.writeStartElement("item")
		writer.writeAttribute("type", item.group(1))
		writer.writeStartElement("span")
		writer.writeCharacters(item.group(1)+" ")
		writer.writeEndElement() // span
		writer.writeCharacters(item.group(2))
		writer.writeEndElement() // item
		writer.writeCharacters("\n")
	}
	if (!found) {
		writer.writeCharacters(value)
	}
	writer.writeEndElement() // list
	writer.writeCharacters("\n")
}
380

  
381
/**
 * Writes a date column as five attributes named after the normalized column name
 * (AsciiUtils.buildAttributeId(att)) followed by a suffix:
 * "jour" (day), "joursemaine" (weekday name, formatted with the 'EEEE' pattern),
 * "mois" (month), "annee" (year) and "tri" (yyyy-MM-dd).
 *
 * The date is the first dd/MM/yyyy occurrence found in the cell. When the cell is missing
 * or holds no such date, a warning is printed and the five attributes are written empty
 * instead of failing the whole export.
 *
 * @param record the record to read, accessed with record.get(att)
 * @param att the name of the date column
 * @param writer the XMLStreamWriter positioned on the element receiving the attributes
 */
def writeMetadataDate(def record, def att, def writer) {
	String value = record.get(att)
	String att_normalized = AsciiUtils.buildAttributeId(att)

	def date = ((value == null ? "" : value) =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
	if (!date.find()) {
		println "** Warning: '$att' value '$value' is not a dd/MM/yyyy date, writing empty date attributes"
		for (String suffix : ["jour", "joursemaine", "mois", "annee", "tri"]) {
			writer.writeAttribute(att_normalized+suffix, "")
		}
		return
	}

	String day = date.group(1)
	String month = date.group(2)
	String year = date.group(3)
	// parse only the matched date so that surrounding text in the cell cannot break the parsing
	def parsed = new java.text.SimpleDateFormat("dd/MM/yyyy").parse(date.group(0))

	writer.writeAttribute(att_normalized+"jour", day)
	writer.writeAttribute(att_normalized+"joursemaine", new java.text.SimpleDateFormat('EEEE').format(parsed))
	writer.writeAttribute(att_normalized+"mois", month)
	writer.writeAttribute(att_normalized+"annee", year)
	writer.writeAttribute(att_normalized+"tri", year+"-"+month+"-"+day)
}
391

  
392
/*
393
 String name = inputFile.getName()
394
 int idx = name.lastIndexOf(".")
395
 if (idx > 0) name = name.substring(0, idx)
396
 */
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/antract/WordProperty2WordMacro.groovy (revision 3209)
1
// Copyright © 2021 ENS Lyon
2
// Licensed under the terms of the GNU General Public License version 3 (http://www.gnu.org/licenses/gpl-3.0.html)
3
// @author mdecorde
4
// @author sheiden
1 5

  
2 6
package org.txm.macro.annotation
3 7

  
......
28 32
Project project = mcorpus.getProject()
29 33
File txmDir = new File(project.getProjectDirectory(), "txm/"+mcorpus.getName())
30 34

  
31
@Field @Option(name="outputDirectory", usage="Directory of resulting XML-TXM files", widget="Folder", required=true, def="result_directory")
35
@Field @Option(name="outputDirectory", usage="Directory of resulting XML-TXM files", widget="Folder", required=true, def="")
32 36
File outputDirectory
33 37

  
34
@Field @Option(name="word_property", usage="The word property to move", widget="String", required=true, def="type")
38
@Field @Option(name="word_property", usage="The word property to project", widget="String", required=true, def="type")
35 39
String word_property
36 40

  
37
@Field @Option(name="word_property_value_to_ignore", usage="The property value to ignore", widget="String", required=true, def="__undef__")
38
String word_property_value_to_ignore
41
@Field @Option(name="values_to_ignore_regex", usage="regex of values not to project", widget="String", required=true, def="")
42
String values_to_ignore_regex
39 43

  
44
@Field @Option(name="debug", usage="Afficher les messages de debug", widget="Boolean", required=true, def="false")
45
def debug
46

  
40 47
// Open the parameters input dialog box
41 48
if (!ParametersDialog.open(this)) return
42 49

  
......
45 52
	return false
46 53
}
47 54

  
55
values_to_ignore_regex = /$values_to_ignore_regex/
56

  
48 57
for (Text text : project.getChildren(Text.class)) {
49 58
	println "Text: "+text
50 59
	
......
62 71
	}
63 72
	
64 73
	XMLProcessor xp = new XMLProcessor(orig);
65
	LocalNameHookActivator activator = new LocalNameHookActivator("w");
74
	LocalNameHookActivator activator = new LocalNameHookActivator("w")
66 75
	new DOMIdentityHook("word", activator, xp) {
67 76
		@Override
68 77
		public void processDom() {
69 78
			//println dom
70 79
			use(groovy.xml.dom.DOMCategory) {
71 80
				def form = dom.form[0]
72
				def anatype = dom.ana.findAll(){ a -> a['@type'] == "#$word_property"}
81
				def anatype = dom.ana.findAll(){ a ->
82
					if (debug && (a['@type'] == "#$word_property")) {
83
						println 'a["@type"] = '+a["@type"]
84
						println 'a.text() = '+a.text()
85
						println 'a["@type"] == "#$word_property"'+" = "+(a['@type'] == "#$word_property")
86
						println "values_to_ignore_regex = "+values_to_ignore_regex
87
						println '!(a.text() ==~ values_to_ignore_regex))'+" = "+(!(a.text() ==~ values_to_ignore_regex))
88
						println '((a["@type"] == "#$word_property") && !(a.text() ==~ values_to_ignore_regex)) = '+((a['@type'] == "#$word_property") && !(a.text() ==~ values_to_ignore_regex))
89
					}
90
					return ((a['@type'] == "#$word_property") && !(a.text() ==~ values_to_ignore_regex))
91
				}
73 92
//				println form.text()+" -> $anatype"
74 93
				if (anatype.size() > 0) { // keep the word
75 94
					anatype = anatype[0]
......
78 97
					anatype['@type'] = "#form"
79 98
					anatype.setTextContent(s)
80 99
				} else {
81
					dom = null;
100
					dom = null
82 101
				}
83 102
			}
84 103
		}
85 104
	}
86 105
	xp.process(result)
87
}
106
}

Formats disponibles : Unified diff