root / tmp / org.txm.core / src / java / org / txm / importer / scripts / xmltxm / AnnotationInjection.groovy @ 1681
History | View | Annotate | Download (12.7 kB)
1 |
|
---|---|
2 |
|
3 |
// Copyright © 2010-2013 ENS de Lyon.
|
4 |
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
5 |
// Lyon 2, University of Franche-Comté, University of Nice
|
6 |
// Sophia Antipolis, University of Paris 3.
|
7 |
//
|
8 |
// The TXM platform is free software: you can redistribute it
|
9 |
// and/or modify it under the terms of the GNU General Public
|
10 |
// License as published by the Free Software Foundation,
|
11 |
// either version 2 of the License, or (at your option) any
|
12 |
// later version.
|
13 |
//
|
14 |
// The TXM platform is distributed in the hope that it will be
|
15 |
// useful, but WITHOUT ANY WARRANTY; without even the implied
|
16 |
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
17 |
// PURPOSE. See the GNU General Public License for more
|
18 |
// details.
|
19 |
//
|
20 |
// You should have received a copy of the GNU General
|
21 |
// Public License along with the TXM platform. If not, see
|
22 |
// http://www.gnu.org/licenses.
|
23 |
//
|
24 |
//
|
25 |
//
|
26 |
// $LastChangedDate: 2016-03-29 09:51:35 +0200 (mar. 29 mars 2016) $
|
27 |
// $LastChangedRevision: 3185 $
|
28 |
// $LastChangedBy: mdecorde $
|
29 |
//
|
30 |
package org.txm.importer.scripts.xmltxm
|
31 |
|
32 |
import javax.xml.stream.* |
33 |
|
34 |
import org.txm.importer.StaxIdentityParser |
35 |
import org.txm.importer.filters.* |
36 |
import org.txm.importer.PersonalNamespaceContext |
37 |
import java.io.File |
38 |
import java.io.IOException |
39 |
|
40 |
import javax.xml.stream.XMLStreamException |
41 |
|
42 |
/**
 * The Class AnnotationInjection: injects the annotations of a stand-off
 * file into an xml-tei-txm file.
 *
 * @author mdecorde
 */
|
50 |
|
51 |
public class AnnotationInjection extends StaxIdentityParser { |
52 |
|
53 |
public static String TXMNS = "http://textometrie.org/1.0" |
54 |
|
55 |
/** The xml reader factory. */
|
56 |
private def factory; |
57 |
|
58 |
/** The links. */
|
59 |
private LinkedHashSet<String> links; |
60 |
boolean replace
|
61 |
|
62 |
/** The linkparsers. key=type*/
|
63 |
private LinkedHashMap<String, XMLStreamReader> linkparsers; |
64 |
|
65 |
/** The anaurl. */
|
66 |
private def anaurl; |
67 |
|
68 |
/** The anainput data. */
|
69 |
private def anainputData; |
70 |
|
71 |
/** The anafactory. */
|
72 |
private XMLInputFactory anafactory = XMLInputFactory.newInstance();
|
73 |
|
74 |
/** The anaparser. */
|
75 |
private XMLStreamReader anaparser;
|
76 |
private XMLStreamReader headerparser;
|
77 |
|
78 |
/** The resp stmt id. */
|
79 |
String respStmtID = ""; |
80 |
|
81 |
/** The present taxonomies. */
|
82 |
ArrayList<String> presentTaxonomies = new ArrayList(); |
83 |
|
84 |
/**
|
85 |
* Instantiates a new annotation injection.
|
86 |
*
|
87 |
* @param url the xml-tei-txm file
|
88 |
* @param anaurl the stand-off file
|
89 |
*/
|
90 |
public AnnotationInjection(URL url, URL anaurl) { |
91 |
this(url, anaurl, false) |
92 |
} |
93 |
|
94 |
/**
|
95 |
* Instantiates a new annotation injection.
|
96 |
*
|
97 |
* @param url the xml-tei-txm file
|
98 |
* @param anaurl the stand-off file
|
99 |
*/
|
100 |
public AnnotationInjection(URL url, URL anaurl, boolean replace) { |
101 |
super(url); // init reader and writer |
102 |
try {
|
103 |
this.anaurl = anaurl;
|
104 |
this.replace = replace
|
105 |
factory = XMLInputFactory.newInstance(); |
106 |
this.buildLinkParsers();// build a parser per linkgroup |
107 |
} catch (XMLStreamException ex) {
|
108 |
System.out.println(ex);
|
109 |
} catch (IOException ex) { |
110 |
System.out.println("IOException while parsing "); |
111 |
} |
112 |
} |
113 |
|
114 |
private void getHeaderInfos(String containertag, boolean captureTheTag) |
115 |
{ |
116 |
anainputData = new BufferedInputStream(anaurl.openStream()); |
117 |
headerparser = anafactory.createXMLStreamReader(anainputData); |
118 |
boolean start = false; |
119 |
String localname;
|
120 |
for (int event = headerparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = headerparser.next()) |
121 |
{ |
122 |
//String prefix = headerparser.getNamespaceURI();
|
123 |
if (event == XMLStreamConstants.START_ELEMENT) {
|
124 |
localname = headerparser.getLocalName(); |
125 |
if (captureTheTag && localname == containertag) // start copy after the tag |
126 |
start = true;
|
127 |
if (start) { // copy header |
128 |
String prefix = headerparser.getPrefix();
|
129 |
if (prefix.length() > 0) |
130 |
writer.writeStartElement(Nscontext.getNamespaceURI(prefix), localname) |
131 |
else
|
132 |
writer.writeStartElement(localname); |
133 |
for (int i = 0 ; i < headerparser.getNamespaceCount(); i++) |
134 |
writer.writeNamespace(headerparser.getNamespacePrefix(i), headerparser.getNamespaceURI(i)); |
135 |
for (int i = 0 ; i < headerparser.getAttributeCount(); i++) |
136 |
writer.writeAttribute(headerparser.getAttributeLocalName(i), headerparser.getAttributeValue(i)); |
137 |
} |
138 |
if (!captureTheTag && localname == containertag) // start copy after the tag |
139 |
start = true;
|
140 |
} else if (event == XMLStreamConstants.END_ELEMENT) { |
141 |
localname = headerparser.getLocalName(); |
142 |
if (!captureTheTag && localname == containertag)
|
143 |
break;// stop looping |
144 |
|
145 |
if (start)
|
146 |
writer.writeEndElement(); |
147 |
|
148 |
if (captureTheTag && localname == containertag)
|
149 |
break;// stop looping |
150 |
} else if (event == XMLStreamConstants.CHARACTERS) { |
151 |
if (start)
|
152 |
writer.writeCharacters(headerparser.getText()); |
153 |
} else if (event == XMLStreamConstants.COMMENT) { |
154 |
if (start)
|
155 |
writer.writeComment(headerparser.getText()); |
156 |
} |
157 |
} |
158 |
headerparser.close(); |
159 |
} |
160 |
|
161 |
/**
|
162 |
* find all refs.
|
163 |
*
|
164 |
* @return the list of link parser
|
165 |
*/
|
166 |
private LinkedHashSet<String> findGrpLink() |
167 |
{ |
168 |
LinkedHashSet<String> links = new LinkedHashSet<String>(); |
169 |
anainputData = anaurl.openStream(); |
170 |
anaparser = anafactory.createXMLStreamReader(anainputData); |
171 |
|
172 |
for (int event = anaparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = anaparser.next()) { |
173 |
if (event == XMLStreamConstants.START_ELEMENT) {
|
174 |
if (anaparser.getLocalName().equals("linkGrp")) { |
175 |
String targetsvalue = anaparser.getAttributeValue(0) |
176 |
|
177 |
if (links.contains(targetsvalue)) {
|
178 |
System.err.println("Warning: Multiple group declaration : "+targetsvalue+" has already been added, the first one will be used") |
179 |
} else {
|
180 |
links.add(targetsvalue); // add the taxonomy type
|
181 |
} |
182 |
} else if (anaparser.getLocalName().equals("respStmt")) { |
183 |
respStmtID = anaparser.getAttributeValue(0); // one attribute (id) only |
184 |
} |
185 |
} |
186 |
} |
187 |
anaparser.close(); |
188 |
return links;
|
189 |
} |
190 |
|
191 |
/**
|
192 |
* Builds the link parsers.
|
193 |
* I need to know what groups exists to build a parser per taxonomy and go to the first link element
|
194 |
*/
|
195 |
private void buildLinkParsers() { |
196 |
|
197 |
// link group of the standoff file
|
198 |
links = findGrpLink(); |
199 |
linkparsers = new LinkedHashMap<String, XMLStreamReader>(); |
200 |
|
201 |
// build one parser per link group
|
202 |
for (String link : links) { // build a parser per group |
203 |
anainputData = new BufferedInputStream(anaurl.openStream()); |
204 |
linkparsers.put(link, anafactory.createXMLStreamReader(anainputData)); |
205 |
} |
206 |
|
207 |
//for each parser
|
208 |
for (String link : links) { |
209 |
anaparser = linkparsers.get(link); |
210 |
for (int event = anaparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = anaparser.next()) { |
211 |
if (event == XMLStreamConstants.START_ELEMENT) {
|
212 |
if (anaparser.getLocalName().equals("linkGrp")) { // position the parser to the right group |
213 |
String targetsvalue = anaparser.getAttributeValue(0) |
214 |
if (targetsvalue.equals(link)) {
|
215 |
break; // next element is a link start tag |
216 |
} |
217 |
} |
218 |
} |
219 |
} |
220 |
} |
221 |
} |
222 |
|
223 |
public boolean process(File outfile) throws XMLStreamException, IOException |
224 |
{ |
225 |
boolean ret = super.process(outfile); |
226 |
releaseLinkParsers(); |
227 |
} |
228 |
|
229 |
/**
|
230 |
* Release the link parsers.
|
231 |
*/
|
232 |
private void releaseLinkParsers() { |
233 |
if (linkparsers == null) return; |
234 |
|
235 |
for (String l : linkparsers.keySet()) { |
236 |
XMLStreamReader p = linkparsers.get(l); |
237 |
if (p != null) { |
238 |
try {
|
239 |
p.close(); |
240 |
} catch(Exception e) { |
241 |
println "** Can not close $l link parser $p: $e"
|
242 |
} |
243 |
} |
244 |
} |
245 |
} |
246 |
|
247 |
/**
|
248 |
* get the next tei:link value of a tei:LinkGrp.
|
249 |
*
|
250 |
* @param link the link
|
251 |
* @return the next ana
|
252 |
*/
|
253 |
private String getNextAnaValue(String link, String wordId) { |
254 |
anaparser = linkparsers.get(link); |
255 |
def m;
|
256 |
for (int event = anaparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = anaparser.next()) { |
257 |
if (event == XMLStreamConstants.START_ELEMENT) {
|
258 |
if (anaparser.getLocalName().equals("link")) { |
259 |
String targetsvalue = anaparser.getAttributeValue(0) |
260 |
if ((m = targetsvalue =~ /#(.*) #(.*)/)) { // balise externe |
261 |
def g1 = m[0][1]; |
262 |
def g2 = m[0][2]; |
263 |
|
264 |
String anavalue = g2;
|
265 |
anavalue = anavalue.replace("<", "<") |
266 |
return anavalue;
|
267 |
} else {
|
268 |
System.err.println("Error: getNextAna(): link target is not well formed: = "+anaparser.getAttributeValue(0)); |
269 |
} |
270 |
} |
271 |
} |
272 |
} |
273 |
return ""; |
274 |
} |
275 |
|
276 |
/**
|
277 |
* build the ana tags of a word.
|
278 |
*
|
279 |
* @param wordId the word id
|
280 |
* @return the ana tag
|
281 |
*/
|
282 |
private void writeAnaTags(String wordId) |
283 |
{ |
284 |
String anabalises ="\n"; |
285 |
for (String link : links) { |
286 |
writer.writeStartElement(TXMNS, "ana");
|
287 |
writer.writeAttribute("resp", "#"+respStmtID); |
288 |
writer.writeAttribute("type", "#"+link); |
289 |
if (replace) {
|
290 |
if (linkparsers.containsKey(link)) { // order is important |
291 |
writer.writeCharacters(getNextAnaValue(link, wordId)); |
292 |
} else {
|
293 |
writer.writeCharacters(anaValues.get(link)); |
294 |
} |
295 |
} else { // ok no problem |
296 |
writer.writeCharacters(getNextAnaValue(link, wordId)); |
297 |
} |
298 |
|
299 |
writer.writeEndElement(); // txm:ana
|
300 |
} |
301 |
} |
302 |
|
303 |
String wordId;
|
304 |
HashMap<String, String> anaValues = new HashMap<String, String>(); |
305 |
boolean flagSourceDesc = false, flagW = false, flagAna = false; |
306 |
String type = null, resp = null, anaValue= ""; |
307 |
protected void processStartElement() { |
308 |
|
309 |
if (localname.equals("taxonomy")) { |
310 |
String taxo = parser.getAttributeValue(0) // taxonomy type |
311 |
presentTaxonomies.add(taxo); |
312 |
} else if (flagW && replace && localname.equals("ana")) { |
313 |
flagAna = true
|
314 |
anaValue= ""
|
315 |
type = null
|
316 |
resp = null
|
317 |
for (int i= 0 ; i < parser.getAttributeCount() ; i++ ) { |
318 |
if (parser.getAttributeLocalName(i) == "resp") { |
319 |
resp = parser.getAttributeValue(i); |
320 |
} else if (parser.getAttributeLocalName(i) == "type") { |
321 |
type = parser.getAttributeValue(i); |
322 |
} |
323 |
} |
324 |
if (type != null) type = type.substring(1); |
325 |
if (resp != null) resp = resp.substring(1); |
326 |
return; // don't write the "ana" start element |
327 |
} else if (localname.equals("w")) { |
328 |
for (int i= 0 ; i < parser.getAttributeCount() ; i++ ) { |
329 |
if (parser.getAttributeLocalName(i) == "id") { |
330 |
wordId = parser.getAttributeValue(i); |
331 |
break
|
332 |
} |
333 |
} |
334 |
flagW = true
|
335 |
anaValues.clear() |
336 |
} |
337 |
|
338 |
super.processStartElement();
|
339 |
} |
340 |
|
341 |
protected void processCharacters() { |
342 |
if (flagAna) anaValue += parser.getText();
|
343 |
else super.processCharacters(); // FORM CONTENT LOST !!!!!!!!!!!!! |
344 |
} |
345 |
|
346 |
boolean applicationWritten = false; |
347 |
boolean taxonomiesWritten = false; |
348 |
protected void processEndElement() { |
349 |
switch (parser.getLocalName()) {
|
350 |
case "w": |
351 |
writeAnaTags(wordId); |
352 |
flagW = false
|
353 |
break;
|
354 |
case "ana": |
355 |
if (flagAna && replace && type != null && resp != null && anaValue != null) { |
356 |
anaValues.put(type, anaValue) |
357 |
links.add(type) |
358 |
flagAna = false
|
359 |
return; // don't write the "ana" end element |
360 |
} |
361 |
flagAna = false
|
362 |
break;
|
363 |
|
364 |
case "appInfo": |
365 |
applicationWritten = true;
|
366 |
getHeaderInfos("appInfo", false); |
367 |
break;
|
368 |
|
369 |
case "classDecl": |
370 |
taxonomiesWritten = true;
|
371 |
getHeaderInfos("classDecl", false); |
372 |
break;
|
373 |
|
374 |
case "encodingDesc": |
375 |
if (!applicationWritten) {
|
376 |
writer.writeStartElement("appInfo");
|
377 |
getHeaderInfos("appInfo", false); |
378 |
writer.writeEndElement(); // appInfo
|
379 |
} |
380 |
if (!taxonomiesWritten) {
|
381 |
writer.writeStartElement("classDecl");
|
382 |
getHeaderInfos("classDecl", false); |
383 |
writer.writeEndElement(); // classDecl
|
384 |
} |
385 |
break;
|
386 |
|
387 |
case "titleStmt": |
388 |
if (flagSourceDesc) {
|
389 |
//output.write(this.respStmt+"\n")
|
390 |
getHeaderInfos("respStmt", true); |
391 |
flagSourceDesc = false;
|
392 |
break;
|
393 |
} |
394 |
break;
|
395 |
} |
396 |
super.processEndElement();
|
397 |
} |
398 |
|
399 |
/** The declarenamespace. */
|
400 |
boolean declarenamespace = false; |
401 |
|
402 |
/**
|
403 |
* Declare namespace.
|
404 |
*
|
405 |
* @return the java.lang. object
|
406 |
*/
|
407 |
private declareNamespace() {
|
408 |
if (!declarenamespace) {
|
409 |
writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0");
|
410 |
writer.writeNamespace("txm", TXMNS);
|
411 |
declarenamespace = true;
|
412 |
} |
413 |
} |
414 |
|
415 |
/**
|
416 |
* The main method.
|
417 |
*
|
418 |
* @param args the arguments
|
419 |
*/
|
420 |
public static void main(String[] args) { |
421 |
|
422 |
String rootDir = "~/xml/rgaqcj/"; |
423 |
new File(rootDir + "/injection/").mkdir(); |
424 |
|
425 |
def milestones = ["tagUsage", "pb", "lb","catRef"]// the tags who |
426 |
|
427 |
File srcfile = new File(rootDir, "/anainline/", "roland.xml"); |
428 |
File pos1file = new File(rootDir, "/pos/", "rolandTT1-w-ana.xml"); |
429 |
|
430 |
File src2file = new File(rootDir, "/injection/", "roland.xml"); |
431 |
File pos2file = new File(rootDir, "/pos/", "rolandTT2-w-ana.xml"); |
432 |
|
433 |
println("process file : " + srcfile + " with : " + pos1file); |
434 |
def builder = new AnnotationInjection(srcfile.toURI().toURL(), |
435 |
pos1file.toURI().toURL(), milestones); |
436 |
builder.transfomFile(new File(rootDir + "/injection/", "roland.xml")); |
437 |
|
438 |
println("process file : " + src2file + " with : " + pos1file); |
439 |
builder = new AnnotationInjection(src2file.toURI().toURL(), pos2file.toURI().toURL(),
|
440 |
milestones); |
441 |
builder.transfomFile(rootDir + "/injection/", "roland-FINAL.xml"); |
442 |
|
443 |
return;
|
444 |
} |
445 |
} |