Révision 1972

tmp/org.txm.core/src/java/org/txm/scripts/importer/XPathResult.groovy (revision 1972)
2 2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 3
// Lyon 2, University of Franche-Comté, University of Nice
4 4
// Sophia Antipolis, University of Paris 3.
5
// 
5
//
6 6
// The TXM platform is free software: you can redistribute it
7 7
// and/or modify it under the terms of the GNU General Public
8 8
// License as published by the Free Software Foundation,
9 9
// either version 2 of the License, or (at your option) any
10 10
// later version.
11
// 
11
//
12 12
// The TXM platform is distributed in the hope that it will be
13 13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 15
// PURPOSE. See the GNU General Public License for more
16 16
// details.
17
// 
17
//
18 18
// You should have received a copy of the GNU General
19 19
// Public License along with the TXM platform. If not, see
20 20
// http://www.gnu.org/licenses.
21
// 
22
// 
23
// 
21
//
22
//
23
//
24 24
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
25 25
// $LastChangedRevision: 3426 $
26
// $LastChangedBy: mdecorde $ 
26
// $LastChangedBy: mdecorde $
27 27
//
28 28
package org.txm.scripts.importer;
29 29

  
......
51 51
 * return the id of a bfm tag <milestone/>
52 52
 */
53 53
public class XPathResult {
54
	
54

  
55 55
	/** The DOM document parsed from the XML file; XPath queries are evaluated against its root element. */
56 56
	Document doc;
57 57
	XPath xpath;
......
64 64
	public XPathResult(File xmlfile) {
65 65
		this(xmlfile, true)
66 66
	}
67
	
67

  
68 68
	/**
69 69
	 * Instantiates a new XPathResult by parsing the given XML file and preparing an XPath evaluator.
70 70
	 *
......
74 74
		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
75 75
		factory.setXIncludeAware(true);
76 76
		factory.setNamespaceAware(namespaceAware); // never forget this!
77
		
77

  
78 78
		DocumentBuilder builder = factory.newDocumentBuilder();
79 79
		doc = builder.parse(xmlfile);
80
		
80

  
81 81
		XPathFactory xfactory = XPathFactory.newInstance();
82 82
		xpath = xfactory.newXPath();
83 83
		xpath.setNamespaceContext(new PersonalNamespaceContext());
84 84
	}
85
	
85

  
86
	public Document getDocument() {
87
		return doc;
88
	}
89

  
86 90
	public def getNodes(String query) {
87 91
		def rnodes = [];
88
		
92

  
89 93
		XPathExpression expr = xpath.compile(query);
90 94
		Object result = expr.evaluate(doc.getDocumentElement(), XPathConstants.NODESET);
91
		
92
		NodeList nodes = (NodeList) result;
93
		for (int i = 0; i < nodes.getLength(); i++) {
94
			rnodes.add(nodes.item(i));
95
		if (result instanceof NodeList) {
96
			NodeList nodes = (NodeList) result;
97
			for (int i = 0; i < nodes.getLength(); i++) {
98
				rnodes.add(nodes.item(i));
99
			}
100
		} else {
101
			result.getClass()
102
			rnodes << result
95 103
		}
96 104
		return rnodes;
97 105
	}
98
	
106

  
99 107
	public String getXpathResponse(String query) {
100 108
		XPathExpression expr = xpath.compile(query);
101 109
		Object result = expr.evaluate(doc.getDocumentElement(), XPathConstants.NODESET);
102
		
110

  
103 111
		NodeList nodes = (NodeList) result;
104 112
		for (int i = 0; i < nodes.getLength(); i++) {
105 113
			//println nodes.item(i)
106 114
			return (nodes.item(i).getNodeValue());
107 115
		}
108 116
	}
109
	
117

  
110 118
	public ArrayList<String> getXpathResponses(String query) {
111 119
		ArrayList<String> xresult = new ArrayList<String>();
112 120
		XPathExpression expr = xpath.compile(query);
113 121
		Object result = expr.evaluate(doc.getDocumentElement(), XPathConstants.NODESET);
114
		
122

  
115 123
		NodeList nodes = (NodeList) result;
116 124
		for (int i = 0; i < nodes.getLength(); i++) {
117 125
			//println nodes.item(i)
......
119 127
		}
120 128
		return xresult
121 129
	}
122
	
130

  
123 131
	public String getXpathResponse(String query, String devaultValue) {
124 132
		String rez = getXpathResponse(query);
125 133
		if (rez == null)
126 134
			return devaultValue;
127 135
		return rez;
128 136
	}
129
	
137

  
130 138
	public void close() {
131 139
		xpath = null;
132 140
		doc = null;
133 141
	}
134 142

  
135 143
	/**
136

  
137 144
	 * OBSOLETE VERSION FOR TXM: returns the node text content given an XPath
138

  
139 145
	 * "//path.../.../@attr"
140 146
	 * 
141 147
	 * @param path
......
164 170
	 * 
165 171
	 * } return ""; }
166 172
	 */
167
	
173

  
168 174
	static public String getXpathResponse(File xmlfile, String query, String devaultValue) {
169 175
		String rez = getXpathResponse(xmlfile, query);
170 176
		if (rez == null)
171 177
			return devaultValue;
172 178
		return rez;
173 179
	}
174
	
180

  
175 181
	static public String getXpathResponse(File xmlfile, String query, String devaultValue, boolean namespaceAware) {
176 182
		String rez = getXpathResponse(xmlfile, query, namespaceAware);
177 183
		if (rez == null)
......
189 195
	static public String getXpathResponse(File xmlfile, String query) {
190 196
		return getXpathResponse(xmlfile, query, true);
191 197
	}
192
	
198

  
193 199
	/**
194 200
	 * Gets the value of the first node matching the XPath query in the given XML file.
195 201
	 *
......
200 206
	static public String getXpathResponse(File xmlfile, String query, boolean namespaceAware) {
201 207
		XPathResult result = new XPathResult(xmlfile);
202 208
		return result.getXpathResponse(query);
203
//		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
204
//		factory.setNamespaceAware(namespaceAware); // never forget this!
205
//		
206
//		DocumentBuilder builder = factory.newDocumentBuilder();
207
//		Document doc = builder.parse(xmlfile);
208
//		
209
//		XPathFactory xfactory = XPathFactory.newInstance();
210
//		XPath xpath = xfactory.newXPath();
211
//		xpath.setNamespaceContext(new PersonalNamespaceContext());
212
//		
213
//		XPathExpression expr = xpath.compile(query);
214
//		Object result = expr.evaluate(doc, XPathConstants.NODESET);
215
//		
216
//		NodeList nodes = (NodeList) result;
217
//		println "size: "+result.getLength()
218
//		for (int i = 0; i < nodes.getLength(); i++) {
219
//			return (nodes.item(i).getNodeValue());
220
//		}
209
		//		DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
210
		//		factory.setNamespaceAware(namespaceAware); // never forget this!
211
		//
212
		//		DocumentBuilder builder = factory.newDocumentBuilder();
213
		//		Document doc = builder.parse(xmlfile);
214
		//
215
		//		XPathFactory xfactory = XPathFactory.newInstance();
216
		//		XPath xpath = xfactory.newXPath();
217
		//		xpath.setNamespaceContext(new PersonalNamespaceContext());
218
		//
219
		//		XPathExpression expr = xpath.compile(query);
220
		//		Object result = expr.evaluate(doc, XPathConstants.NODESET);
221
		//
222
		//		NodeList nodes = (NodeList) result;
223
		//		println "size: "+result.getLength()
224
		//		for (int i = 0; i < nodes.getLength(); i++) {
225
		//			return (nodes.item(i).getNodeValue());
226
		//		}
221 227
	}
222 228

  
223 229
	/**
......
227 233
	 */
228 234
	public static void main(String[] args) {
229 235
		println("main of XPathResult")
230
		File xmlFile = new File("/home/mdecorde/TXM/corpora/superphenix/txm/SUPERPHENIX/CreysSuper_04_0175.xml")
231
		String value = XPathResult.getXpathResponse(xmlFile, "TEI/text/@date");
232
		println "value: $value"
236
		File xmlFile = new File("/home/mdecorde/runtime-rcpapplication.product/corpora/DEMOCRATLYON3/txm/DEMOCRATLYON3/DiderotEssais.xml")
237
		XPathResult processor = new XPathResult(xmlFile)
238
		for (def node : processor.getNodes(xmlFile, "TEI/text/@date")) {
239
			println "node: $node"
240
		}
233 241
		//value = XPathResult.getXpathResponse(new File("/home/mdecorde/xml/bfm/strasb.xml"), "tei:TEI/tei:teiHeader/tei:revisionDesc/tei:change[contains(.,'étiquetage morpho')]");
234 242
		//println "value: $value"
235 243
	}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/corpus/Metadata2TeiHeaderMacro.groovy (revision 1972)
7 7
import groovy.transform.Field
8 8

  
9 9
import org.txm.importer.StaxIdentityParser
10
import org.txm.objects.*
10 11
import org.txm.rcp.swt.widget.parameters.*
11 12
import org.txm.searchengine.cqp.corpus.*
12 13
import org.txm.utils.CsvReader
13 14
import org.txm.utils.io.IOUtils
15
import org.txm.scripts.importer.XPathResult
16
import javax.xml.xpath.XPathConstants
14 17

  
15
if (!(corpusViewSelection instanceof CQPCorpus)) {
16
	println "** Corpora selection is not a Corpus. Aborting."
17
	return;
18
}
18
@Field @Option(name="defaultTeiHeaderFile", usage="the default teiHeader of texts", widget="FileOpen", required=true, def="defaultTeiHeaderFile.xml")
19
def defaultTeiHeaderFile
19 20

  
20
@Field @Option(name="xpathFile", usage="properties file to redirect metadata column to the teiHeader locations", widget="FileOpen", required=true, def="file.xml")
21
@Field @Option(name="xpathFile", usage="properties file to redirect metadata column to the teiHeader locations", widget="FileOpen", required=true, def="xpathFile.properties")
21 22
def xpathFile
22 23

  
23
@Field @Option(name="metadataFile", usage="the TSV file containing the metadata values per text", widget="FileOpen", required=true, def="file.xml")
24
@Field @Option(name="metadataFile", usage="the TSV file containing the metadata values per text", widget="FileOpen", required=true, def="metadataFile.tsv")
24 25
def metadataFile
25 26

  
26 27
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF	ON	ALL	REALLY ALL", required=true, def="OFF")
......
30 31
if (debug == "OFF") debug = 0; else if (debug == "ON") debug = 1; else if (debug == "ALL") debug = 2 else if (debug == "REALLY ALL") debug = 3
31 32

  
32 33

  
33
def corpus = corpusViewSelection
34
def project = corpus.getProject()
35 34

  
36
def txmDir = new File(project.getProjectDirectory(), "txm/"+corpus.getID())
37

  
38
if (!txmDir.exists()) {
39
	println "** the selected corpus has no XML-TXM files. Aborting."
40
}
41

  
42 35
def xpathProperties = new Properties()
43 36
xpathProperties.load(IOUtils.getReader(xpathFile))
44 37
println xpathProperties
......
52 45
	println "** 'id' column not found in $metadataFile"
53 46
	return;
54 47
}
48

  
49
def corpora = [:]
50
for (Project project : Workspace.getInstance().getProjects()) {
51
	for (MainCorpus corpus : project.getChildren(MainCorpus.class)) {
52
		corpora[corpus.getID()] = corpus;
53
	}
54
}
55

  
55 56
while (csvReader.readRecord())	{
56 57
	String text_id = csvReader.get("id")
57 58
	if (text_id == null || text_id.length() == 0) {
58
		println "** @id not found for record="+csvReader.getRawRecord()
59
		//println "** @id not found for record="+csvReader.getRawRecord()
59 60
		continue;
60 61
	}
62
	
63
	String corpus_id = csvReader.get("corpus_id")
64
	if (corpus_id == null || corpus_id.length() == 0) {
65
		println "** @corpus_id not found for record="+csvReader.getRawRecord()
66
		continue;
67
	}
68
	
69
	MainCorpus corpus = corpora[corpus_id]
70
	if (corpus == null) {
71
		println "** no corpus found for ID=$corpus_id"
72
		continue;
73
	}
74
	def project = corpus.getProject()
75
	
76
	def txmDir = new File(project.getProjectDirectory(), "txm/"+corpus.getID())
77
	
78
	if (!txmDir.exists()) {
79
		println "** the selected corpus has no XML-TXM files. Aborting."
80
	}
81
	
82
	
61 83
	File txmFile = new File(txmDir, text_id+".xml")
62 84
	if (txmFile.exists()) {
63 85
		
64 86
		def data = [:]
65 87
		for (def h : header) data[h] = csvReader.get(h)
66 88
			
67
		injecting(txmFile, data, xpathProperties)
89
		String xmlteiHeaderContent = getCustomizedTEiHeader(defaultTeiHeaderFile, data, xpathProperties);
90
		if (xmlteiHeaderContent != null && xmlteiHeaderContent.length() > 0) {
91
			injecting(txmFile, xmlteiHeaderContent)
92
		} else {
93
			println "** Text header not updated: $txmFile"
94
		}
68 95
	} else {
69 96
		println "** Text not found: $txmFile"
70 97
	}
71 98
}
72 99

  
73
def injecting(File txmFile, def data, def paths) {
100
def getCustomizedTEiHeader(File defaultTeiHeaderFile, def data, Properties xpathProperties) {
101
	XPathResult xpathProcessor = new XPathResult(defaultTeiHeaderFile);
102
	for (String info : data.keySet()) {
103
		String xpath = xpathProperties[info];
104
		String value = data[info]
105
		if (xpath == null) {
106
			continue; // not a data to inject
107
		}
108
		if (debug > 1) println "   injecting '$info'='$value' in '$xpath'"
109
		
110
def expr = xpathProcessor.xpath.compile(xpath);
111
def first = expr.evaluate(xpathProcessor.doc.getDocumentElement(), XPathConstants.NODE);
112
		if (first != null) {
113
			switch (first.getNodeType()) {
114
				case 1: // element
115
					if (debug > 1) println "   patching: $info with "+value
116
					def newChild = xpathProcessor.doc.createElement(first.getTagName())
117
					newChild.appendChild(xpathProcessor.doc.createTextNode(value))
118
					first.getParentNode().replaceChild(newChild, first)
119
					break;
120
				case 2: // attribute
121
					if (debug > 1) println "   patching attribute: $info with "+value
122
					first.setNodeValue(value)
123
					break;
124
				case 3: // text
125
					if (debug > 1) println "   patching text: $info with "+value
126
					first.setNodeValue(value)
127
					break;
128
				default:
129
					break
130
			}
131
		} else {
132
			println "** not found $xpath"
133
		}
134
	}
135
	
136
//	return xpathProcessor.doc.getElementsByTagName("teiHeader").item(0).getNodeValue();
137
	//println "DOC="+xpathProcessor.doc.getDocumentElement().getElementsByTagName("teiHeader").item(0).toString()
138
	String content = ""+xpathProcessor.doc.getDocumentElement().getElementsByTagName("teiHeader").item(0)
139
	content = content.substring('<?xml version="1.0" encoding="UTF-8"?>'.length())
140
}
141

  
142
/**
143
 * Replaces the teiHeader element of the given XML-TXM file with the supplied teiHeader content.
144
 * @param txmFile
145
 * @param data
146
 * @param paths
147
 * @param xmlteiHeaderContent
148
 * @return true if the file was rewritten and put in place of the original, false otherwise
149
 */
150
def injecting(File txmFile, String xmlteiHeaderContent) {
74 151
	println "Editing: $txmFile..."
75 152
	StaxIdentityParser sparser = new StaxIdentityParser(txmFile) {
76
		String currentXPath = ""
77 153
		boolean start = false;
78
		def foundPaths = [:]
79 154
		public void processStartElement() {
80
			super.processStartElement();
81 155
			
82
			if (localname == "teiHeader") start = true;
83
			if (start) {
84
				currentXPath += "/"+localname 
85
				def attributes = [:]
86
				for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
87
					attributes[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i);
88
				}
89
				if (attributes.size() > 0)
90
					println currentXPath+attributes
91
				else 
92
					println currentXPath
156
			if (localname == "teiHeader") {
157
				start = true;
158
				if (debug > 2) println "   replacing teiHeader"
159
				if (debug > 3) println "       with $xmlteiHeaderContent"
160
				output.write(xmlteiHeaderContent.getBytes(Charset.forName("UTF-8")));  // REPLACE CONTENT !
93 161
			}
162
			if (!start) {
163
				super.processStartElement();
164
			}
94 165
		}
95 166
		
96 167
		public void processEndElement() {
97
			super.processEndElement();
98
			if (localname == "teiHeader") start = false;
99
			if (start) {
100
				currentXPath = currentXPath.substring(0, currentXPath.lastIndexOf("/"))
168
			if (!start) {
169
				super.processEndElement();
101 170
			}
171
			if (localname == "teiHeader") {
172
				start = false;
173
				if (debug > 2) println "   replace done"
174
			}
102 175
		}
176
		
177
		public void processCharacters() {
178
			if (!start) {
179
				super.processCharacters();
180
			}
181
		}
182
		
183
		public void processComment() {
184
			if (!start) {
185
				super.processComment();
186
			}
187
		}
103 188
	}
104
	File outfile = File.createTempFile("txm", txmFile.getName())
105 189
	
106
	boolean ret = sparser.process(outfile);
107
	if (ret) { // replace inputFile
190
	File tmpDirectory = new File("/home/mdecorde/TEMP");
191
	File outfile = new File(tmpDirectory, "tmp_"+txmFile.getName())
192
	File copyFile = new File(tmpDirectory, "copy_"+txmFile.getName())
193
	
194
	if (sparser.process(outfile)) { // replace inputFile
195
		if (txmFile.renameTo(copyFile)) {
196
			if (outfile.renameTo(txmFile)) {
197
				println " -> SUCCESS see $txmFile"
198
				return true
199
			} else {
200
				println " -> FAIL could not replace $txmFile"
201
				println " -> see result in $outfile"
202
				
203
			}
204
		} else {
205
			println " -> FAIL could not make a copy of $txmFile in $copyFile"
206
			println " -> see result in $outfile"
207
		}
108 208
		
209
	} else {
210
		println " -> FAIL see $outfile"
109 211
	}
110 212
	
111
	return ret;		
213
	return false;		
112 214
}

Formats disponibles : Unified diff