root / tmp / org.txm.analec.rcp / src / org / txm / macro / urs / democrat / Metadata2TeiHeaderMacro.groovy @ 2027
History | View | Annotate | Download (7 kB)
1 |
// STANDARD DECLARATIONS
|
---|---|
2 |
package org.txm.macro.urs.democrat
|
3 |
|
4 |
import java.nio.charset.Charset |
5 |
|
6 |
import org.kohsuke.args4j.* |
7 |
import groovy.transform.Field |
8 |
|
9 |
import org.txm.importer.StaxIdentityParser |
10 |
import org.txm.objects.* |
11 |
import org.txm.rcp.swt.widget.parameters.* |
12 |
import org.txm.searchengine.cqp.corpus.* |
13 |
import org.txm.utils.CsvReader |
14 |
import org.txm.utils.io.IOUtils |
15 |
import org.txm.scripts.importer.XPathResult |
16 |
import javax.xml.xpath.XPathConstants |
17 |
|
18 |
class Metadata2TEiHeaderMacro { |
19 |
|
20 |
/**
 * Verbosity level read by the methods of this class before printing
 * diagnostics: getCustomizedTEiHeader prints when it is greater than 1,
 * injecting prints when it is greater than 2 (and the replacement content
 * when it is greater than 3). Defaults to 0.
 */
int debug = 0

/**
 * Creates the helper with the given verbosity level.
 *
 * @param debug the verbosity level to store in the debug field
 */
public Metadata2TEiHeaderMacro(int debug) {
	this.debug = debug
}
24 |
//@Field @Option(name="teiHeaderTemplateFile", usage="the default teiHeader of texts", widget="FileOpen", required=true, def="teiHeaderTemplateFile.xml")
|
25 |
//def teiHeaderTemplateFile
|
26 |
//
|
27 |
//@Field @Option(name="xpathFile", usage="properties file to redirect metadata column to the teiHeader locations", widget="FileOpen", required=true, def="xpathFile.properties")
|
28 |
//def xpathFile
|
29 |
//
|
30 |
//@Field @Option(name="metadataFile", usage="the TSV file containing the metadata values per text", widget="FileOpen", required=true, def="metadataFile.tsv")
|
31 |
//def metadataFile
|
32 |
//
|
33 |
//@Field @Option(name="debug", usage="Show internal variable content", widget="StringArray", metaVar="OFF ON ALL REALLY ALL", required=true, def="OFF")
|
34 |
//debug
|
35 |
//
|
36 |
//if (!ParametersDialog.open(this)) return;
|
37 |
//if (debug == "OFF") debug = 0; else if (debug == "ON") debug = 1; else if (debug == "ALL") debug = 2 else if (debug == "REALLY ALL") debug = 3
|
38 |
//
|
39 |
//
|
40 |
//
|
41 |
//def xpathProperties = new Properties()
|
42 |
//xpathProperties.load(IOUtils.getReader(xpathFile))
|
43 |
//println xpathProperties
|
44 |
//
|
45 |
//def csvReader = new CsvReader(metadataFile.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
|
46 |
//println csvReader
|
47 |
//
|
48 |
//csvReader.readHeaders();
|
49 |
//def header = csvReader.getHeaders()
|
50 |
//if (!header.contains("id")) {
|
51 |
// println "** 'id' column not found in $metadataFile header=$header"
|
52 |
// return;
|
53 |
//}
|
54 |
//if (!header.contains("corpus_id")) {
|
55 |
// println "** 'corpus_id' column not found in $metadataFile header=$header"
|
56 |
// return;
|
57 |
//}
|
58 |
//
|
59 |
//def corpora = [:]
|
60 |
//for (Project project : Workspace.getInstance().getProjects()) {
|
61 |
// for (MainCorpus corpus : project.getChildren(MainCorpus.class)) {
|
62 |
// corpora[corpus.getID()] = corpus;
|
63 |
// }
|
64 |
//}
|
65 |
//
|
66 |
//while (csvReader.readRecord()) {
|
67 |
// String text_id = csvReader.get("id")
|
68 |
// if (text_id == null || text_id.length() == 0) {
|
69 |
// //println "** @id not found for record="+csvReader.getRawRecord()
|
70 |
// continue;
|
71 |
// }
|
72 |
//
|
73 |
// String corpus_id = csvReader.get("corpus_id")
|
74 |
// if (corpus_id == null || corpus_id.length() == 0) {
|
75 |
// println "** @corpus_id not found for record="+csvReader.getRawRecord()
|
76 |
// continue;
|
77 |
// }
|
78 |
//
|
79 |
// MainCorpus corpus = corpora[corpus_id]
|
80 |
// if (corpus == null) {
|
81 |
// println "** no corpus found for ID=$corpus_id"
|
82 |
// continue;
|
83 |
// }
|
84 |
// def project = corpus.getProject()
|
85 |
//
|
86 |
// def txmDir = new File(project.getProjectDirectory(), "txm/"+corpus.getID())
|
87 |
//
|
88 |
// if (!txmDir.exists()) {
|
89 |
// println "** the selected corpus has no XML-TXM files. Aborting."
|
90 |
// }
|
91 |
//
|
92 |
//
|
93 |
// File txmFile = new File(txmDir, text_id+".xml")
|
94 |
// if (txmFile.exists()) {
|
95 |
//
|
96 |
// def data = [:]
|
97 |
//
|
98 |
// for (def h : header) data[h] = csvReader.get(h)
|
99 |
//
|
100 |
// String xmlteiHeaderContent = getCustomizedTEiHeader(teiHeaderTemplateFile, data, xpathProperties);
|
101 |
// if (xmlteiHeaderContent != null && xmlteiHeaderContent.length() > 0) {
|
102 |
// injecting(txmFile, xmlteiHeaderContent)
|
103 |
// } else {
|
104 |
// println "** Text header not updated: $txmFile"
|
105 |
// }
|
106 |
// } else {
|
107 |
// println "** Text not found: $txmFile"
|
108 |
// }
|
109 |
//}
|
110 |
|
111 |
/**
 * Builds the teiHeader content of one text from a template file.
 *
 * For every key of data that has an entry in xpathProperties, the XPath
 * found there is evaluated against the template document and the first
 * matching node is patched with the metadata value:
 * element nodes get the value as their only text content (their attributes
 * are kept), attribute and text nodes get the value as node value.
 * Keys without an XPath are skipped; XPaths matching nothing are reported
 * on the console.
 *
 * @param teiHeaderTemplateFile the XML template containing a teiHeader element
 * @param data map of metadata name to value (a null value is injected as "")
 * @param xpathProperties metadata name to XPath of the node to patch
 * @return the serialized teiHeader without a leading XML declaration, or
 *         null when the template contains no teiHeader element
 */
def getCustomizedTEiHeader(File teiHeaderTemplateFile, def data, Properties xpathProperties) {
	XPathResult xpathProcessor = new XPathResult(teiHeaderTemplateFile);
	def document = xpathProcessor.doc

	for (String info : data.keySet()) {
		String xpath = xpathProperties[info];
		if (xpath == null) {
			continue; // not a data to inject
		}
		// a missing metadata value is injected as an empty string rather than null
		String value = data[info] ?: ""
		if (debug > 1) println " injecting '$info'='$value' in '$xpath'"

		def expr = xpathProcessor.xpath.compile(xpath);
		def first = expr.evaluate(document.getDocumentElement(), XPathConstants.NODE);
		if (first == null) {
			println "** not found $xpath"
			continue;
		}

		// groovy.util.Node is auto-imported, hence the fully qualified DOM constants
		switch (first.getNodeType()) {
			case org.w3c.dom.Node.ELEMENT_NODE:
				if (debug > 1) println " patching: $info with "+value
				// Patch in place: keeps the element's attributes and namespace so that
				// other XPaths targeting its attributes still match afterwards.
				first.setTextContent(value)
				break;
			case org.w3c.dom.Node.ATTRIBUTE_NODE:
				if (debug > 1) println " patching attribute: $info with "+value
				first.setNodeValue(value)
				break;
			case org.w3c.dom.Node.TEXT_NODE:
				if (debug > 1) println " patching text: $info with "+value
				first.setNodeValue(value)
				break;
			default:
				break
		}
	}

	def root = document.getDocumentElement()
	def header = root.getElementsByTagName("teiHeader").item(0)
	if (header == null && root.getTagName() == "teiHeader") {
		// getElementsByTagName only visits descendants: the template root itself may be the teiHeader
		header = root
	}
	if (header == null) {
		println "** no teiHeader element found in $teiHeaderTemplateFile"
		return null
	}

	// NOTE(review): relies on the runtime rendering the DOM node as XML when
	// converted to a String - confirm with the DOM implementation in use.
	String content = ""+header
	// drop a leading XML declaration only when one is actually present
	content = content.replaceFirst(/^<\?xml[^?]*\?>/, "")
	return content
}
152 |
|
153 |
/**
 * Replaces the teiHeader element of an XML file with the given content.
 *
 * The file is rewritten through a StaxIdentityParser into a temporary file:
 * everything from the teiHeader start tag to its end tag is dropped and
 * xmlteiHeaderContent is written in its place (UTF-8). A backup copy of
 * the original file ("copy_" + name) is then stored in the working
 * directory and the rewritten file ("tmp_" + name) is moved over the
 * original. The original file is only replaced once the backup succeeded,
 * so a failure never leaves the text file missing.
 *
 * @param txmFile the XML file whose teiHeader must be replaced
 * @param xmlteiHeaderContent the serialized teiHeader to write instead
 * @param tmpDirectory optional directory receiving the temporary and backup
 *        files; defaults to the "java.io.tmpdir" system property
 * @return true if txmFile was replaced, false otherwise
 */
def injecting(File txmFile, String xmlteiHeaderContent, File tmpDirectory = null) {
	println "Editing: $txmFile..."

	// localname and output are not declared here; presumably inherited from StaxIdentityParser
	StaxIdentityParser sparser = new StaxIdentityParser(txmFile) {
		// true while the parser is inside the original teiHeader, whose events are skipped
		boolean start = false;

		public void processStartElement() {
			if (localname == "teiHeader") {
				start = true;
				if (debug > 2) println " replacing teiHeader"
				if (debug > 3) println " with $xmlteiHeaderContent"
				output.write(xmlteiHeaderContent.getBytes(Charset.forName("UTF-8"))); // REPLACE CONTENT !
			}
			if (!start) {
				super.processStartElement();
			}
		}

		public void processEndElement() {
			if (!start) {
				super.processEndElement();
			}
			// reset after the check above so that the teiHeader end tag itself is skipped too
			if (localname == "teiHeader") {
				start = false;
				if (debug > 2) println " replace done"
			}
		}

		public void processCharacters() {
			if (!start) {
				super.processCharacters();
			}
		}

		public void processComment() {
			if (!start) {
				super.processComment();
			}
		}
	}

	File workDirectory = tmpDirectory
	if (workDirectory == null) {
		workDirectory = new File(System.getProperty("java.io.tmpdir"))
	}
	workDirectory.mkdirs()
	File outfile = new File(workDirectory, "tmp_"+txmFile.getName())
	File copyFile = new File(workDirectory, "copy_"+txmFile.getName())

	if (!sparser.process(outfile)) {
		println " -> FAIL see $outfile"
		return false;
	}

	// Back up by copying (not renaming) so that the original stays in place
	// if the replacement below fails.
	try {
		java.nio.file.Files.copy(txmFile.toPath(), copyFile.toPath(),
				java.nio.file.StandardCopyOption.REPLACE_EXISTING)
	} catch (IOException e) {
		println " -> FAIL could not make a copy of $txmFile in $copyFile"
		println " -> see result in $outfile"
		println " -> cause: $e"
		return false;
	}

	// Files.move also works when the working directory is on another file system.
	try {
		java.nio.file.Files.move(outfile.toPath(), txmFile.toPath(),
				java.nio.file.StandardCopyOption.REPLACE_EXISTING)
	} catch (IOException e) {
		println " -> FAIL could not replace $txmFile"
		println " -> see result in $outfile"
		println " -> cause: $e"
		return false;
	}

	println " -> SUCCESS see $txmFile"
	return true
}
226 |
} |