Révision 2037

tmp/org.txm.analec.rcp/src/org/txm/macro/urs/democrat/Metadata2TeiHeader.groovy (revision 2037)
1
// STANDARD DECLARATIONS
2
package org.txm.macro.urs.democrat
3

  
4
import java.nio.charset.Charset
5

  
6
import org.kohsuke.args4j.*
7
import groovy.transform.Field
8
import net.sf.saxon.functions.IndexOf
9

  
10
import org.txm.importer.StaxIdentityParser
11
import org.txm.objects.*
12
import org.txm.rcp.swt.widget.parameters.*
13
import org.txm.searchengine.cqp.corpus.*
14
import org.txm.utils.CsvReader
15
import org.txm.utils.io.IOUtils
16
import org.txm.scripts.importer.XPathResult
17
import javax.xml.xpath.XPathConstants
18

  
19
class Metadata2TEiHeader {
20
	
21
	/** Verbosity level read by the methods of this class; higher values print more. */
	int debug = 0

	/**
	 * Creates a header builder with the given verbosity.
	 *
	 * @param debug the verbosity level stored in the {@code debug} field
	 */
	Metadata2TEiHeader(int debug) {
		this.debug = debug
	}
25
	//@Field @Option(name="teiHeaderTemplateFile", usage="the default teiHeader of texts", widget="FileOpen", required=true, def="teiHeaderTemplateFile.xml")
26
	//def teiHeaderTemplateFile
27
	//
28
	//@Field @Option(name="xpathFile", usage="properties file to redirect metadata column to the teiHeader locations", widget="FileOpen", required=true, def="xpathFile.properties")
29
	//def xpathFile
30
	//
31
	//@Field @Option(name="metadataFile", usage="the TSV file containing the metadata values per text", widget="FileOpen", required=true, def="metadataFile.tsv")
32
	//def metadataFile
33
	//
34
	//@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF	ON	ALL	REALLY ALL", required=true, def="OFF")
35
	//debug
36
	//
37
	//if (!ParametersDialog.open(this)) return;
38
	//if (debug == "OFF") debug = 0; else if (debug == "ON") debug = 1; else if (debug == "ALL") debug = 2 else if (debug == "REALLY ALL") debug = 3
39
	//
40
	//def xpathProperties = new Properties()
41
	//xpathProperties.load(IOUtils.getReader(xpathFile))
42
	//println xpathProperties
43
	//
44
	//def csvReader = new CsvReader(metadataFile.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
45
	//println csvReader
46
	//
47
	//csvReader.readHeaders();
48
	//def header = csvReader.getHeaders()
49
	//if (!header.contains("id")) {
50
	//	println "** 'id' column not found in $metadataFile header=$header"
51
	//	return;
52
	//}
53
	//if (!header.contains("corpus_id")) {
54
	//	println "** 'corpus_id' column not found in $metadataFile header=$header"
55
	//	return;
56
	//}
57
	//
58
	//def corpora = [:]
59
	//for (Project project : Workspace.getInstance().getProjects()) {
60
	//	for (MainCorpus corpus : project.getChildren(MainCorpus.class)) {
61
	//		corpora[corpus.getID()] = corpus;
62
	//	}
63
	//}
64
	//
65
	//while (csvReader.readRecord())	{
66
	//	String text_id = csvReader.get("id")
67
	//	if (text_id == null || text_id.length() == 0) {
68
	//		//println "** @id not found for record="+csvReader.getRawRecord()
69
	//		continue;
70
	//	}
71
	//
72
	//	String corpus_id = csvReader.get("corpus_id")
73
	//	if (corpus_id == null || corpus_id.length() == 0) {
74
	//		println "** @corpus_id not found for record="+csvReader.getRawRecord()
75
	//		continue;
76
	//	}
77
	//
78
	//	MainCorpus corpus = corpora[corpus_id]
79
	//	if (corpus == null) {
80
	//		println "** no corpus found for ID=$corpus_id"
81
	//		continue;
82
	//	}
83
	//	def project = corpus.getProject()
84
	//
85
	//	def txmDir = new File(project.getProjectDirectory(), "txm/"+corpus.getID())
86
	//
87
	//	if (!txmDir.exists()) {
88
	//		println "** the selected corpus has no XML-TXM files. Aborting."
89
	//	}
90
	//
91
	//
92
	//	File txmFile = new File(txmDir, text_id+".xml")
93
	//	if (txmFile.exists()) {
94
	//
95
	//		def data = [:]
96
	//
97
	//		for (def h : header) data[h] = csvReader.get(h)
98
	//
99
	//		String xmlteiHeaderContent = getCustomizedTEiHeader(teiHeaderTemplateFile, data, xpathProperties);
100
	//		if (xmlteiHeaderContent != null && xmlteiHeaderContent.length() > 0) {
101
	//			injecting(txmFile, xmlteiHeaderContent)
102
	//		} else {
103
	//			println "** Text header not updated: $txmFile"
104
	//		}
105
	//	} else {
106
	//		println "** Text not found: $txmFile"
107
	//	}
108
	//}
109

  
110
	/**
	 * Builds a customized teiHeader from a template file and the metadata of one text.
	 *
	 * For each key of {@code data} that has an XPath registered in {@code xpathProperties},
	 * the first node matching the XPath in the template is patched with the value.
	 * When nothing matches, the last step of the XPath is created under its parent
	 * (if the parent can be found).
	 *
	 * @param teiHeaderTemplateFile the XML template handed to XPathResult
	 * @param data map of metadata name to metadata value
	 * @param xpathProperties map of metadata name to the XPath where the value is injected
	 * @return the serialized "teiHeader" element without XML declaration,
	 *         or null when the template contains no "teiHeader" element
	 */
	def getCustomizedTEiHeader(File teiHeaderTemplateFile, def data, Properties xpathProperties) {
		XPathResult xpathProcessor = new XPathResult(teiHeaderTemplateFile)
		for (String info : data.keySet()) {
			String xpath = xpathProperties[info]
			String value = data[info]
			if (xpath == null) {
				continue // not a data to inject
			}
			if (debug > 1) println "   injecting '$info'='$value' in '$xpath'"

			def expr = xpathProcessor.xpath.compile(xpath)
			def first = expr.evaluate(xpathProcessor.doc.getDocumentElement(), XPathConstants.NODE)
			if (first != null) {
				switch (first.getNodeType()) {
					case 1: // element: replaced by a same-named element holding only the value
						if (debug > 1) println "   patching: $info with "+value
						def newChild = xpathProcessor.doc.createElement(first.getTagName())
						newChild.appendChild(xpathProcessor.doc.createTextNode(value))

						def attributes = first.getAttributes()
						for (int i = 0 ; i < attributes.getLength() ; i++) { // copy attributes
							def attr = attributes.item(i)
							newChild.setAttribute(attr.getNodeName(), attr.getNodeValue())
						}

						first.getParentNode().replaceChild(newChild, first)
						break
					case 2: // attribute
						if (debug > 1) println "   patching attribute: $info with "+value
						first.setNodeValue(value)
						break
					case 3: // text
						if (debug > 1) println "   patching text: $info with "+value
						first.setNodeValue(value)
						break
					default:
						break
				}
			} else {
				// nothing matches: try finding the parent and add a new node to it
				String attribute_xpath = null
				int attributeStep = xpath.lastIndexOf("/@")
				if (attributeStep > 0) {
					attribute_xpath = xpath.substring(attributeStep + 2)
					xpath = xpath.substring(0, attributeStep)
				}

				// an XPath without a usable parent step cannot be created: it falls
				// through to the "not found or created" message instead of crashing
				int lastSlash = xpath.lastIndexOf("/")
				String parent_xpath = null
				String element_xpath = xpath
				def parent = null
				if (lastSlash > 0) {
					parent_xpath = xpath.substring(0, lastSlash)
					element_xpath = xpath.substring(lastSlash + 1)
					def parent_expr = xpathProcessor.xpath.compile(parent_xpath)
					parent = parent_expr.evaluate(xpathProcessor.doc.getDocumentElement(), XPathConstants.NODE)
				}

				if (parent != null) {
					// drop the prefix if any; indexOf returns -1 when there is none, so +1 keeps the whole step
					String nodename = element_xpath.substring(element_xpath.indexOf(":")+1)
					def attributes = [:]
					if (nodename.contains("[")) {
						String attributesString = nodename.substring(nodename.indexOf("[")+1, nodename.length()-1)
						for (String attributeString : attributesString.split(",")) {
							def split = attributeString.split("=", 2)
							if (split.length < 2 || split[0].length() < 1 || split[1].length() < 2) {
								continue // not an @name="value" test (e.g. a positional predicate)
							}
							attributes[split[0].substring(1)] = split[1].substring(1, split[1].length()-1) // remove @ and remove ""
						}
						nodename = nodename.substring(0, nodename.indexOf("["))
					}
					def newChild = xpathProcessor.doc.createElement(nodename)
					for (String attributeName : attributes.keySet()) {
						newChild.setAttribute(attributeName, attributes[attributeName])
					}

					if (attribute_xpath != null) {
						newChild.setAttribute(attribute_xpath, value)
					} else {
						newChild.appendChild(xpathProcessor.doc.createTextNode(value))
					}

					if (debug > 1) println "  creating node $parent_xpath / $nodename[$attributes] : $attribute_xpath = $value"
					parent.appendChild(newChild)
				} else {
					println " ** info=$info not found or created for "+xpathProperties[info]
				}
			}
		}

		def teiHeader = xpathProcessor.doc.getDocumentElement().getElementsByTagName("teiHeader").item(0)
		if (teiHeader == null) {
			println " ** no teiHeader element found in $teiHeaderTemplateFile"
			return null
		}

		// NOTE(review): the string conversion of the node presumably starts with an XML
		// declaration (the previous code stripped it by length) - only strip it when present
		String content = ""+teiHeader
		String xmlDeclaration = '<?xml version="1.0" encoding="UTF-8"?>'
		if (content.startsWith(xmlDeclaration)) {
			content = content.substring(xmlDeclaration.length())
		}
		return content
	}
196

  
197
	/**
	 * Replaces the teiHeader element of an XML file with the given content.
	 *
	 * The file is rewritten into a temporary file; the original is then moved aside
	 * as "copy_&lt;name&gt;" in the temporary directory and the rewritten file takes its place.
	 *
	 * @param txmFile the XML file to patch
	 * @param xmlteiHeaderContent the serialized teiHeader written in place of the existing one
	 * @return true if txmFile was replaced, false otherwise
	 */
	def replaceHeader(File txmFile, String xmlteiHeaderContent) {
		println " editing: $txmFile..."
		if (xmlteiHeaderContent == null) {
			println " -> FAIL no teiHeader content to write in $txmFile"
			return false
		}

		StaxIdentityParser sparser = new StaxIdentityParser(txmFile) {
					// true while the parser is inside the teiHeader being replaced
					boolean start = false;

					public void processStartElement() {
						if (localname == "teiHeader") {
							start = true;
							if (debug > 2) println "   replacing teiHeader"
							if (debug > 3) println "       with $xmlteiHeaderContent"
							output.write(xmlteiHeaderContent.getBytes(Charset.forName("UTF-8")));  // REPLACE CONTENT !
						}
						if (!start) {
							super.processStartElement();
						}
					}

					public void processEndElement() {
						if (!start) {
							super.processEndElement();
						}
						if (localname == "teiHeader") {
							start = false;
							if (debug > 2) println "   replace done"
						}
					}

					public void processCharacters() {
						if (!start) {
							super.processCharacters();
						}
					}

					public void processComment() {
						if (!start) {
							super.processComment();
						}
					}
				}

		// use the JVM temporary directory rather than a developer-specific absolute path
		File tmpDirectory = new File(System.getProperty("java.io.tmpdir"))
		File outfile = new File(tmpDirectory, "tmp_"+txmFile.getName())
		File copyFile = new File(tmpDirectory, "copy_"+txmFile.getName())

		if (sparser.process(outfile)) { // replace inputFile
			if (moveFile(txmFile, copyFile)) {
				if (moveFile(outfile, txmFile)) {
					println " -> SUCCESS see $txmFile"
					return true
				} else {
					println " -> FAIL could not replace $txmFile"
					println " -> see result in $outfile"
				}
			} else {
				println " -> FAIL could not make a copy of $txmFile in $copyFile"
				println " -> see result in $outfile"
			}
		} else {
			println " -> FAIL see $outfile"
		}

		return false
	}

	/**
	 * Moves a file, falling back to java.nio when File.renameTo fails
	 * (renameTo may fail when source and target are on different filesystems).
	 *
	 * @param source the file to move
	 * @param target the destination, replaced if it already exists
	 * @return true if the file was moved
	 */
	private boolean moveFile(File source, File target) {
		if (source.renameTo(target)) {
			return true
		}
		try {
			java.nio.file.Files.move(source.toPath(), target.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING)
			return true
		} catch (IOException e) {
			println " ** could not move $source to $target: $e"
			return false
		}
	}
270
}
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/democrat/Metadata2TEIHeader.groovy (revision 2037)
1
// STANDARD DECLARATIONS
2
package org.txm.macro.urs.democrat
3

  
4
import java.nio.charset.Charset
5

  
6
import org.kohsuke.args4j.*
7
import groovy.transform.Field
8
import net.sf.saxon.functions.IndexOf
9

  
10
import org.txm.importer.StaxIdentityParser
11
import org.txm.objects.*
12
import org.txm.rcp.swt.widget.parameters.*
13
import org.txm.searchengine.cqp.corpus.*
14
import org.txm.utils.CsvReader
15
import org.txm.utils.io.IOUtils
16
import org.txm.scripts.importer.XPathResult
17
import javax.xml.xpath.XPathConstants
18

  
19
class Metadata2TEIHeader {
20
	
21
	/** Verbosity level read by the methods of this class; higher values print more. */
	int debug = 0

	/**
	 * Creates a header builder with the given verbosity.
	 *
	 * @param debug the verbosity level stored in the {@code debug} field
	 */
	Metadata2TEIHeader(int debug) {
		this.debug = debug
	}
25
	//@Field @Option(name="teiHeaderTemplateFile", usage="the default teiHeader of texts", widget="FileOpen", required=true, def="teiHeaderTemplateFile.xml")
26
	//def teiHeaderTemplateFile
27
	//
28
	//@Field @Option(name="xpathFile", usage="properties file to redirect metadata column to the teiHeader locations", widget="FileOpen", required=true, def="xpathFile.properties")
29
	//def xpathFile
30
	//
31
	//@Field @Option(name="metadataFile", usage="the TSV file containing the metadata values per text", widget="FileOpen", required=true, def="metadataFile.tsv")
32
	//def metadataFile
33
	//
34
	//@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF	ON	ALL	REALLY ALL", required=true, def="OFF")
35
	//debug
36
	//
37
	//if (!ParametersDialog.open(this)) return;
38
	//if (debug == "OFF") debug = 0; else if (debug == "ON") debug = 1; else if (debug == "ALL") debug = 2 else if (debug == "REALLY ALL") debug = 3
39
	//
40
	//def xpathProperties = new Properties()
41
	//xpathProperties.load(IOUtils.getReader(xpathFile))
42
	//println xpathProperties
43
	//
44
	//def csvReader = new CsvReader(metadataFile.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
45
	//println csvReader
46
	//
47
	//csvReader.readHeaders();
48
	//def header = csvReader.getHeaders()
49
	//if (!header.contains("id")) {
50
	//	println "** 'id' column not found in $metadataFile header=$header"
51
	//	return;
52
	//}
53
	//if (!header.contains("corpus_id")) {
54
	//	println "** 'corpus_id' column not found in $metadataFile header=$header"
55
	//	return;
56
	//}
57
	//
58
	//def corpora = [:]
59
	//for (Project project : Workspace.getInstance().getProjects()) {
60
	//	for (MainCorpus corpus : project.getChildren(MainCorpus.class)) {
61
	//		corpora[corpus.getID()] = corpus;
62
	//	}
63
	//}
64
	//
65
	//while (csvReader.readRecord())	{
66
	//	String text_id = csvReader.get("id")
67
	//	if (text_id == null || text_id.length() == 0) {
68
	//		//println "** @id not found for record="+csvReader.getRawRecord()
69
	//		continue;
70
	//	}
71
	//
72
	//	String corpus_id = csvReader.get("corpus_id")
73
	//	if (corpus_id == null || corpus_id.length() == 0) {
74
	//		println "** @corpus_id not found for record="+csvReader.getRawRecord()
75
	//		continue;
76
	//	}
77
	//
78
	//	MainCorpus corpus = corpora[corpus_id]
79
	//	if (corpus == null) {
80
	//		println "** no corpus found for ID=$corpus_id"
81
	//		continue;
82
	//	}
83
	//	def project = corpus.getProject()
84
	//
85
	//	def txmDir = new File(project.getProjectDirectory(), "txm/"+corpus.getID())
86
	//
87
	//	if (!txmDir.exists()) {
88
	//		println "** the selected corpus has no XML-TXM files. Aborting."
89
	//	}
90
	//
91
	//
92
	//	File txmFile = new File(txmDir, text_id+".xml")
93
	//	if (txmFile.exists()) {
94
	//
95
	//		def data = [:]
96
	//
97
	//		for (def h : header) data[h] = csvReader.get(h)
98
	//
99
	//		String xmlteiHeaderContent = getCustomizedTEiHeader(teiHeaderTemplateFile, data, xpathProperties);
100
	//		if (xmlteiHeaderContent != null && xmlteiHeaderContent.length() > 0) {
101
	//			injecting(txmFile, xmlteiHeaderContent)
102
	//		} else {
103
	//			println "** Text header not updated: $txmFile"
104
	//		}
105
	//	} else {
106
	//		println "** Text not found: $txmFile"
107
	//	}
108
	//}
109

  
110
	/**
	 * Builds a customized teiHeader from a template file and the metadata of one text.
	 *
	 * For each key of {@code data} that has an XPath registered in {@code xpathProperties},
	 * the first node matching the XPath in the template is patched with the value.
	 * When nothing matches, the last step of the XPath is created under its parent
	 * (if the parent can be found).
	 *
	 * @param teiHeaderTemplateFile the XML template handed to XPathResult
	 * @param data map of metadata name to metadata value
	 * @param xpathProperties map of metadata name to the XPath where the value is injected
	 * @return the serialized "teiHeader" element without XML declaration,
	 *         or null when the template contains no "teiHeader" element
	 */
	def getCustomizedTEiHeader(File teiHeaderTemplateFile, def data, Properties xpathProperties) {
		XPathResult xpathProcessor = new XPathResult(teiHeaderTemplateFile)
		for (String info : data.keySet()) {
			String xpath = xpathProperties[info]
			String value = data[info]
			if (xpath == null) {
				continue // not a data to inject
			}
			if (debug > 1) println "   injecting '$info'='$value' in '$xpath'"

			def expr = xpathProcessor.xpath.compile(xpath)
			def first = expr.evaluate(xpathProcessor.doc.getDocumentElement(), XPathConstants.NODE)
			if (first != null) {
				switch (first.getNodeType()) {
					case 1: // element: replaced by a same-named element holding only the value
						if (debug > 1) println "   patching: $info with "+value
						def newChild = xpathProcessor.doc.createElement(first.getTagName())
						newChild.appendChild(xpathProcessor.doc.createTextNode(value))

						def attributes = first.getAttributes()
						for (int i = 0 ; i < attributes.getLength() ; i++) { // copy attributes
							def attr = attributes.item(i)
							newChild.setAttribute(attr.getNodeName(), attr.getNodeValue())
						}

						first.getParentNode().replaceChild(newChild, first)
						break
					case 2: // attribute
						if (debug > 1) println "   patching attribute: $info with "+value
						first.setNodeValue(value)
						break
					case 3: // text
						if (debug > 1) println "   patching text: $info with "+value
						first.setNodeValue(value)
						break
					default:
						break
				}
			} else {
				// nothing matches: try finding the parent and add a new node to it
				String attribute_xpath = null
				int attributeStep = xpath.lastIndexOf("/@")
				if (attributeStep > 0) {
					attribute_xpath = xpath.substring(attributeStep + 2)
					xpath = xpath.substring(0, attributeStep)
				}

				// an XPath without a usable parent step cannot be created: it falls
				// through to the "not found or created" message instead of crashing
				int lastSlash = xpath.lastIndexOf("/")
				String parent_xpath = null
				String element_xpath = xpath
				def parent = null
				if (lastSlash > 0) {
					parent_xpath = xpath.substring(0, lastSlash)
					element_xpath = xpath.substring(lastSlash + 1)
					def parent_expr = xpathProcessor.xpath.compile(parent_xpath)
					parent = parent_expr.evaluate(xpathProcessor.doc.getDocumentElement(), XPathConstants.NODE)
				}

				if (parent != null) {
					// drop the prefix if any; indexOf returns -1 when there is none, so +1 keeps the whole step
					String nodename = element_xpath.substring(element_xpath.indexOf(":")+1)
					def attributes = [:]
					if (nodename.contains("[")) {
						String attributesString = nodename.substring(nodename.indexOf("[")+1, nodename.length()-1)
						for (String attributeString : attributesString.split(",")) {
							def split = attributeString.split("=", 2)
							if (split.length < 2 || split[0].length() < 1 || split[1].length() < 2) {
								continue // not an @name="value" test (e.g. a positional predicate)
							}
							attributes[split[0].substring(1)] = split[1].substring(1, split[1].length()-1) // remove @ and remove ""
						}
						nodename = nodename.substring(0, nodename.indexOf("["))
					}
					def newChild = xpathProcessor.doc.createElement(nodename)
					for (String attributeName : attributes.keySet()) {
						newChild.setAttribute(attributeName, attributes[attributeName])
					}

					if (attribute_xpath != null) {
						newChild.setAttribute(attribute_xpath, value)
					} else {
						newChild.appendChild(xpathProcessor.doc.createTextNode(value))
					}

					if (debug > 1) println "  creating node $parent_xpath / $nodename[$attributes] : $attribute_xpath = $value"
					parent.appendChild(newChild)
				} else {
					println " ** info=$info not found or created for "+xpathProperties[info]
				}
			}
		}

		def teiHeader = xpathProcessor.doc.getDocumentElement().getElementsByTagName("teiHeader").item(0)
		if (teiHeader == null) {
			println " ** no teiHeader element found in $teiHeaderTemplateFile"
			return null
		}

		// NOTE(review): the string conversion of the node presumably starts with an XML
		// declaration (the previous code stripped it by length) - only strip it when present
		String content = ""+teiHeader
		String xmlDeclaration = '<?xml version="1.0" encoding="UTF-8"?>'
		if (content.startsWith(xmlDeclaration)) {
			content = content.substring(xmlDeclaration.length())
		}
		return content
	}
196

  
197
	/**
	 * Replaces the teiHeader element of an XML file with the given content.
	 *
	 * The file is rewritten into a temporary file; the original is then moved aside
	 * as "copy_&lt;name&gt;" in the temporary directory and the rewritten file takes its place.
	 *
	 * @param txmFile the XML file to patch
	 * @param xmlteiHeaderContent the serialized teiHeader written in place of the existing one
	 * @return true if txmFile was replaced, false otherwise
	 */
	def replaceHeader(File txmFile, String xmlteiHeaderContent) {
		println " editing: $txmFile..."
		if (xmlteiHeaderContent == null) {
			println " -> FAIL no teiHeader content to write in $txmFile"
			return false
		}

		StaxIdentityParser sparser = new StaxIdentityParser(txmFile) {
					// true while the parser is inside the teiHeader being replaced
					boolean start = false;

					public void processStartElement() {
						if (localname == "teiHeader") {
							start = true;
							if (debug > 2) println "   replacing teiHeader"
							if (debug > 3) println "       with $xmlteiHeaderContent"
							output.write(xmlteiHeaderContent.getBytes(Charset.forName("UTF-8")));  // REPLACE CONTENT !
						}
						if (!start) {
							super.processStartElement();
						}
					}

					public void processEndElement() {
						if (!start) {
							super.processEndElement();
						}
						if (localname == "teiHeader") {
							start = false;
							if (debug > 2) println "   replace done"
						}
					}

					public void processCharacters() {
						if (!start) {
							super.processCharacters();
						}
					}

					public void processComment() {
						if (!start) {
							super.processComment();
						}
					}
				}

		// use the JVM temporary directory rather than a developer-specific absolute path
		File tmpDirectory = new File(System.getProperty("java.io.tmpdir"))
		File outfile = new File(tmpDirectory, "tmp_"+txmFile.getName())
		File copyFile = new File(tmpDirectory, "copy_"+txmFile.getName())

		if (sparser.process(outfile)) { // replace inputFile
			if (moveFile(txmFile, copyFile)) {
				if (moveFile(outfile, txmFile)) {
					println " -> SUCCESS see $txmFile"
					return true
				} else {
					println " -> FAIL could not replace $txmFile"
					println " -> see result in $outfile"
				}
			} else {
				println " -> FAIL could not make a copy of $txmFile in $copyFile"
				println " -> see result in $outfile"
			}
		} else {
			println " -> FAIL see $outfile"
		}

		return false
	}

	/**
	 * Moves a file, falling back to java.nio when File.renameTo fails
	 * (renameTo may fail when source and target are on different filesystems).
	 *
	 * @param source the file to move
	 * @param target the destination, replaced if it already exists
	 * @return true if the file was moved
	 */
	private boolean moveFile(File source, File target) {
		if (source.renameTo(target)) {
			return true
		}
		try {
			java.nio.file.Files.move(source.toPath(), target.toPath(), java.nio.file.StandardCopyOption.REPLACE_EXISTING)
			return true
		} catch (IOException e) {
			println " ** could not move $source to $target: $e"
			return false
		}
	}
270
}
tmp/org.txm.analec.rcp/src/org/txm/macro/urs/democrat/BuildDemocratCorpusVersion1Macro.groovy (revision 2037)
150 150
	println " compute measures..."
151 151
	def data = new ComputeDemocratMeasureMetadata().getStats(corpus, xpathProperties)
152 152
	
153
	Metadata2TEiHeader mthm = new Metadata2TEiHeader(debug);
153
	Metadata2TEIHeader mthm = new Metadata2TEIHeader(debug);
154 154
	for (def h : header) data[h] = csvReader.get(h)
155 155
	
156 156
	println " creating teiHeader..."

Formats disponibles : Unified diff