Révision 1972
tmp/org.txm.core/src/java/org/txm/scripts/importer/XPathResult.groovy (revision 1972) | ||
---|---|---|
2 | 2 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of |
3 | 3 |
// Lyon 2, University of Franche-Comté, University of Nice |
4 | 4 |
// Sophia Antipolis, University of Paris 3. |
5 |
//
|
|
5 |
// |
|
6 | 6 |
// The TXM platform is free software: you can redistribute it |
7 | 7 |
// and/or modify it under the terms of the GNU General Public |
8 | 8 |
// License as published by the Free Software Foundation, |
9 | 9 |
// either version 2 of the License, or (at your option) any |
10 | 10 |
// later version. |
11 |
//
|
|
11 |
// |
|
12 | 12 |
// The TXM platform is distributed in the hope that it will be |
13 | 13 |
// useful, but WITHOUT ANY WARRANTY; without even the implied |
14 | 14 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
15 | 15 |
// PURPOSE. See the GNU General Public License for more |
16 | 16 |
// details. |
17 |
//
|
|
17 |
// |
|
18 | 18 |
// You should have received a copy of the GNU General |
19 | 19 |
// Public License along with the TXM platform. If not, see |
20 | 20 |
// http://www.gnu.org/licenses. |
21 |
//
|
|
22 |
//
|
|
23 |
//
|
|
21 |
// |
|
22 |
// |
|
23 |
// |
|
24 | 24 |
// $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $ |
25 | 25 |
// $LastChangedRevision: 3426 $ |
26 |
// $LastChangedBy: mdecorde $
|
|
26 |
// $LastChangedBy: mdecorde $ |
|
27 | 27 |
// |
28 | 28 |
package org.txm.scripts.importer; |
29 | 29 |
|
... | ... | |
51 | 51 |
* return the id of a bfm tag <milestone/> |
52 | 52 |
*/ |
53 | 53 |
public class XPathResult { |
54 |
|
|
54 |
|
|
55 | 55 |
/** The doc. */ |
56 | 56 |
Document doc; |
57 | 57 |
XPath xpath; |
... | ... | |
64 | 64 |
public XPathResult(File xmlfile) {
	// One-argument form: delegate to the (File, boolean) constructor with
	// namespace awareness switched on.
	this(xmlfile, true);
}
67 |
|
|
67 |
|
|
68 | 68 |
/** |
69 | 69 |
* Instantiates a new x path result. |
70 | 70 |
* |
... | ... | |
74 | 74 |
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); |
75 | 75 |
factory.setXIncludeAware(true); |
76 | 76 |
factory.setNamespaceAware(namespaceAware); // never forget this! |
77 |
|
|
77 |
|
|
78 | 78 |
DocumentBuilder builder = factory.newDocumentBuilder(); |
79 | 79 |
doc = builder.parse(xmlfile); |
80 |
|
|
80 |
|
|
81 | 81 |
XPathFactory xfactory = XPathFactory.newInstance(); |
82 | 82 |
xpath = xfactory.newXPath(); |
83 | 83 |
xpath.setNamespaceContext(new PersonalNamespaceContext()); |
84 | 84 |
} |
85 |
|
|
85 |
|
|
86 |
/**
 * Gives access to the parsed DOM document.
 *
 * @return the document held by this instance; null once close() has been called
 */
public Document getDocument() {
	return this.doc;
}
|
89 |
|
|
86 | 90 |
/**
 * Evaluates an XPath query against the document element and collects the matches.
 *
 * @param query the XPath expression to compile and evaluate
 * @return a list holding every node of the resulting node set, in NodeList order;
 *         the list is empty when the query matches nothing
 */
public def getNodes(String query) {
	def rnodes = [];

	XPathExpression expr = xpath.compile(query);
	Object result = expr.evaluate(doc.getDocumentElement(), XPathConstants.NODESET);

	if (result instanceof NodeList) {
		NodeList nodes = (NodeList) result;
		for (int i = 0; i < nodes.getLength(); i++) {
			rnodes.add(nodes.item(i));
		}
	} else if (result != null) {
		// Defensive branch for an engine that does not hand back a NodeList:
		// keep the raw result, but never store a null entry in the list.
		rnodes << result
	}
	return rnodes;
}
98 |
|
|
106 |
|
|
99 | 107 |
/**
 * Evaluates an XPath query against the document element and returns the value
 * of the first matching node.
 *
 * @param query the XPath expression to compile and evaluate
 * @return Node.getNodeValue() of the first node of the result set, or null when
 *         the query matches nothing
 */
public String getXpathResponse(String query) {
	NodeList matches = (NodeList) xpath.compile(query).evaluate(doc.getDocumentElement(), XPathConstants.NODESET);

	if (matches.getLength() == 0) {
		return null;
	}
	return matches.item(0).getNodeValue();
}
109 |
|
|
117 |
|
|
110 | 118 |
public ArrayList<String> getXpathResponses(String query) { |
111 | 119 |
ArrayList<String> xresult = new ArrayList<String>(); |
112 | 120 |
XPathExpression expr = xpath.compile(query); |
113 | 121 |
Object result = expr.evaluate(doc.getDocumentElement(), XPathConstants.NODESET); |
114 |
|
|
122 |
|
|
115 | 123 |
NodeList nodes = (NodeList) result; |
116 | 124 |
for (int i = 0; i < nodes.getLength(); i++) { |
117 | 125 |
//println nodes.item(i) |
... | ... | |
119 | 127 |
} |
120 | 128 |
return xresult |
121 | 129 |
} |
122 |
|
|
130 |
|
|
123 | 131 |
/**
 * Same as getXpathResponse(query), with a fallback value.
 *
 * @param query the XPath expression to evaluate
 * @param devaultValue the value returned when the query yields null
 * @return the query response, or devaultValue if the response is null
 */
public String getXpathResponse(String query, String devaultValue) {
	String found = getXpathResponse(query);
	return (found != null) ? found : devaultValue;
}
129 |
|
|
137 |
|
|
130 | 138 |
/**
 * Drops the references to the parsed document and to the XPath engine.
 * The instance cannot evaluate queries afterwards.
 */
public void close() {
	doc = null;
	xpath = null;
}
134 | 142 |
|
135 | 143 |
/** |
136 |
|
|
137 | 144 |
* OBSOLETE VERSION FOR TXM return the node text content given a XPath |
138 |
|
|
139 | 145 |
* "//path.../.../@attr" |
140 | 146 |
* |
141 | 147 |
* @param path |
... | ... | |
164 | 170 |
* |
165 | 171 |
* } return ""; } |
166 | 172 |
*/ |
167 |
|
|
173 |
|
|
168 | 174 |
/**
 * Evaluates a query on an XML file and falls back to a default value.
 *
 * @param xmlfile the XML file to parse
 * @param query the XPath expression to evaluate
 * @param devaultValue the value returned when the query yields null
 * @return the query response, or devaultValue if the response is null
 */
static public String getXpathResponse(File xmlfile, String query, String devaultValue) {
	String found = getXpathResponse(xmlfile, query);
	return (found != null) ? found : devaultValue;
}
174 |
|
|
180 |
|
|
175 | 181 |
static public String getXpathResponse(File xmlfile, String query, String devaultValue, boolean namespaceAware) { |
176 | 182 |
String rez = getXpathResponse(xmlfile, query, namespaceAware); |
177 | 183 |
if (rez == null) |
... | ... | |
189 | 195 |
static public String getXpathResponse(File xmlfile, String query) {
	// Two-argument form: namespace-aware parsing is requested by default.
	boolean namespaceAware = true;
	return getXpathResponse(xmlfile, query, namespaceAware);
}
192 |
|
|
198 |
|
|
193 | 199 |
/** |
194 | 200 |
* Gets the xpath response. |
195 | 201 |
* |
... | ... | |
200 | 206 |
static public String getXpathResponse(File xmlfile, String query, boolean namespaceAware) {
	// Forward namespaceAware to the parser: the one-argument constructor always
	// parses namespace-aware, which silently ignored the caller's choice.
	XPathResult processor = new XPathResult(xmlfile, namespaceAware);
	try {
		// value of the first matching node, or null when nothing matches
		return processor.getXpathResponse(query);
	} finally {
		// release the parsed document as soon as the single query is answered
		processor.close();
	}
}
222 | 228 |
|
223 | 229 |
/** |
... | ... | |
227 | 233 |
*/ |
228 | 234 |
public static void main(String[] args) {
	println("main of XPathResult")
	// Developer smoke test: the path below only exists on the author's machine.
	File xmlFile = new File("/home/mdecorde/runtime-rcpapplication.product/corpora/DEMOCRATLYON3/txm/DEMOCRATLYON3/DiderotEssais.xml")
	XPathResult processor = new XPathResult(xmlFile)
	// getNodes() takes the query only: the file is already bound by the constructor.
	for (def node : processor.getNodes("TEI/text/@date")) {
		println "node: $node"
	}
}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/corpus/Metadata2TeiHeaderMacro.groovy (revision 1972) | ||
---|---|---|
7 | 7 |
import groovy.transform.Field |
8 | 8 |
|
9 | 9 |
import org.txm.importer.StaxIdentityParser |
10 |
import org.txm.objects.* |
|
10 | 11 |
import org.txm.rcp.swt.widget.parameters.* |
11 | 12 |
import org.txm.searchengine.cqp.corpus.* |
12 | 13 |
import org.txm.utils.CsvReader |
13 | 14 |
import org.txm.utils.io.IOUtils |
15 |
import org.txm.scripts.importer.XPathResult |
|
16 |
import javax.xml.xpath.XPathConstants |
|
14 | 17 |
|
15 |
if (!(corpusViewSelection instanceof CQPCorpus)) { |
|
16 |
println "** Corpora selection is not a Corpus. Aborting." |
|
17 |
return; |
|
18 |
} |
|
18 |
@Field @Option(name="defaultTeiHeaderFile", usage="the default teiHeader of texts", widget="FileOpen", required=true, def="defaultTeiHeaderFile.xml") |
|
19 |
def defaultTeiHeaderFile |
|
19 | 20 |
|
20 |
@Field @Option(name="xpathFile", usage="properties file to redirect metadata column to the teiHeader locations", widget="FileOpen", required=true, def="file.xml")
|
|
21 |
@Field @Option(name="xpathFile", usage="properties file to redirect metadata column to the teiHeader locations", widget="FileOpen", required=true, def="xpathFile.properties")
|
|
21 | 22 |
def xpathFile |
22 | 23 |
|
23 |
@Field @Option(name="metadataFile", usage="the TSV file containing the metadata values per text", widget="FileOpen", required=true, def="file.xml")
|
|
24 |
@Field @Option(name="metadataFile", usage="the TSV file containing the metadata values per text", widget="FileOpen", required=true, def="metadataFile.tsv")
|
|
24 | 25 |
def metadataFile |
25 | 26 |
|
26 | 27 |
@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF") |
... | ... | |
30 | 31 |
if (debug == "OFF") debug = 0; else if (debug == "ON") debug = 1; else if (debug == "ALL") debug = 2 else if (debug == "REALLY ALL") debug = 3 |
31 | 32 |
|
32 | 33 |
|
33 |
def corpus = corpusViewSelection |
|
34 |
def project = corpus.getProject() |
|
35 | 34 |
|
36 |
def txmDir = new File(project.getProjectDirectory(), "txm/"+corpus.getID()) |
|
37 |
|
|
38 |
if (!txmDir.exists()) { |
|
39 |
println "** the selected corpus has no XML-TXM files. Aborting." |
|
40 |
} |
|
41 |
|
|
42 | 35 |
def xpathProperties = new Properties() |
43 | 36 |
xpathProperties.load(IOUtils.getReader(xpathFile)) |
44 | 37 |
println xpathProperties |
... | ... | |
52 | 45 |
println "** 'id' column not found in $metadataFile" |
53 | 46 |
return; |
54 | 47 |
} |
48 |
|
|
49 |
// Index every main corpus of the workspace by its ID so that each metadata
// record can name its own corpus through the "corpus_id" column.
def corpora = [:]
for (Project project : Workspace.getInstance().getProjects()) {
	for (MainCorpus corpus : project.getChildren(MainCorpus.class)) {
		corpora[corpus.getID()] = corpus;
	}
}

// One record per text: locate the XML-TXM file of the text and replace its teiHeader.
while (csvReader.readRecord()) {
	String text_id = csvReader.get("id")
	if (text_id == null || text_id.length() == 0) {
		// records without an id are skipped silently
		continue;
	}

	String corpus_id = csvReader.get("corpus_id")
	if (corpus_id == null || corpus_id.length() == 0) {
		println "** @corpus_id not found for record="+csvReader.getRawRecord()
		continue;
	}

	MainCorpus corpus = corpora[corpus_id]
	if (corpus == null) {
		println "** no corpus found for ID=$corpus_id"
		continue;
	}
	def project = corpus.getProject()

	def txmDir = new File(project.getProjectDirectory(), "txm/"+corpus.getID())
	if (!txmDir.exists()) {
		// Without the XML-TXM directory no text file can exist: skip the record
		// instead of falling through to a second, misleading "Text not found" message.
		println "** the corpus $corpus_id has no XML-TXM files in $txmDir. Skipping record."
		continue;
	}

	File txmFile = new File(txmDir, text_id+".xml")
	if (txmFile.exists()) {

		// column name -> value of the current record
		def data = [:]
		for (def h : header) data[h] = csvReader.get(h)

		String xmlteiHeaderContent = getCustomizedTEiHeader(defaultTeiHeaderFile, data, xpathProperties);
		if (xmlteiHeaderContent != null && xmlteiHeaderContent.length() > 0) {
			injecting(txmFile, xmlteiHeaderContent)
		} else {
			println "** Text header not updated: $txmFile"
		}
	} else {
		println "** Text not found: $txmFile"
	}
}
72 | 99 |
|
73 |
def injecting(File txmFile, def data, def paths) { |
|
100 |
/**
 * Builds the teiHeader of one text from the default teiHeader file.
 *
 * For every metadata name that has an entry in xpathProperties, the first node
 * selected by that XPath in the default document is patched with the metadata
 * value: an element is replaced by a new element of the same tag name holding
 * the value as text, an attribute or a text node gets the value as node value.
 *
 * @param defaultTeiHeaderFile the XML file containing the default teiHeader
 * @param data the metadata of the text, as a map of column name to value
 * @param xpathProperties maps a metadata name to the XPath of the node to patch
 * @return the serialized teiHeader element without XML declaration, or null when
 *         the default document contains no teiHeader element below its root
 */
def getCustomizedTEiHeader(File defaultTeiHeaderFile, def data, Properties xpathProperties) {
	XPathResult xpathProcessor = new XPathResult(defaultTeiHeaderFile);
	def doc = xpathProcessor.doc

	for (String info : data.keySet()) {
		String xpath = xpathProperties[info];
		String value = data[info]
		if (xpath == null) {
			continue; // not a data to inject
		}
		if (debug > 1) println " injecting '$info'='$value' in '$xpath'"

		def expr = xpathProcessor.xpath.compile(xpath);
		def first = expr.evaluate(doc.getDocumentElement(), XPathConstants.NODE);
		if (first == null) {
			println "** not found $xpath"
			continue;
		}

		switch (first.getNodeType()) {
			case org.w3c.dom.Node.ELEMENT_NODE:
				if (debug > 1) println " patching: $info with "+value
				def newChild = doc.createElement(first.getTagName())
				newChild.appendChild(doc.createTextNode(value))
				first.getParentNode().replaceChild(newChild, first)
				break;
			case org.w3c.dom.Node.ATTRIBUTE_NODE:
				if (debug > 1) println " patching attribute: $info with "+value
				first.setNodeValue(value)
				break;
			case org.w3c.dom.Node.TEXT_NODE:
				if (debug > 1) println " patching text: $info with "+value
				first.setNodeValue(value)
				break;
			default:
				// other node types are left untouched
				break
		}
	}

	def teiHeader = doc.getDocumentElement().getElementsByTagName("teiHeader").item(0)
	if (teiHeader == null) {
		// Concatenating null would yield the string "null" and break the stripping below.
		println "** no teiHeader element found in $defaultTeiHeaderFile"
		return null
	}

	// NOTE(review): this relies on the string conversion of a DOM element yielding
	// its serialized XML, presumably prefixed with an XML declaration - confirm.
	String content = ""+teiHeader
	if (content.startsWith("<?xml")) {
		// Drop the declaration whatever its exact length, and only if it is there.
		content = content.substring(content.indexOf("?>") + 2)
	}
	return content
}
|
141 |
|
|
142 |
/**
 * Replaces the teiHeader element of an XML-TXM file.
 *
 * The file is copied by a StaxIdentityParser whose callbacks drop every start
 * element, end element, character and comment event located inside the original
 * teiHeader, and write xmlteiHeaderContent when the teiHeader start tag is met.
 * The result is written to the JVM temporary directory; the original file is then
 * moved aside as "copy_<name>" and the result is moved to txmFile.
 *
 * @param txmFile the XML-TXM file to patch
 * @param xmlteiHeaderContent the serialized teiHeader to write in place of the original one
 * @return true if txmFile has been replaced by the patched file, false otherwise
 */
def injecting(File txmFile, String xmlteiHeaderContent) {
	println "Editing: $txmFile..."
	StaxIdentityParser sparser = new StaxIdentityParser(txmFile) {
		// true while the parser is inside the original teiHeader
		boolean start = false;

		public void processStartElement() {
			if (localname == "teiHeader") {
				start = true;
				if (debug > 2) {
					// debug never exceeds 3, so the content is shown at the same
					// level instead of behind an unreachable "debug > 3" test
					println " replacing teiHeader"
					println " with $xmlteiHeaderContent"
				}
				output.write(xmlteiHeaderContent.getBytes("UTF-8")); // REPLACE CONTENT !
			}
			if (!start) {
				super.processStartElement();
			}
		}

		public void processEndElement() {
			if (!start) {
				super.processEndElement();
			}
			// reset the flag only after the test above, so that the closing tag
			// of the original teiHeader is dropped as well
			if (localname == "teiHeader") {
				start = false;
				if (debug > 2) println " replace done"
			}
		}

		public void processCharacters() {
			if (!start) {
				super.processCharacters();
			}
		}

		public void processComment() {
			if (!start) {
				super.processComment();
			}
		}
	}

	// Work in the JVM temporary directory rather than in a developer-specific path.
	File tmpDirectory = new File(System.getProperty("java.io.tmpdir"));
	File outfile = new File(tmpDirectory, "tmp_"+txmFile.getName())
	File copyFile = new File(tmpDirectory, "copy_"+txmFile.getName())

	if (!sparser.process(outfile)) {
		println " -> FAIL see $outfile"
		return false;
	}

	if (!txmFile.renameTo(copyFile)) {
		println " -> FAIL could not make a copy of $txmFile in $copyFile"
		println " -> see result in $outfile"
		return false;
	}

	if (outfile.renameTo(txmFile)) {
		println " -> SUCCESS see $txmFile"
		return true
	}

	println " -> FAIL could not replace $txmFile"
	println " -> see result in $outfile"
	// The original was already moved aside: put it back so the text is not lost.
	if (!copyFile.renameTo(txmFile)) {
		println " -> the original file is kept in $copyFile"
	}
	return false;
}
Formats disponibles : Unified diff