Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / importer / scripts / xmltxm / AnnotationInjection.groovy @ 1688

History | View | Annotate | Download (12.7 kB)

1

    
2

    
3
// Copyright © 2010-2013 ENS de Lyon.
4
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
5
// Lyon 2, University of Franche-Comté, University of Nice
6
// Sophia Antipolis, University of Paris 3.
7
//
8
// The TXM platform is free software: you can redistribute it
9
// and/or modify it under the terms of the GNU General Public
10
// License as published by the Free Software Foundation,
11
// either version 2 of the License, or (at your option) any
12
// later version.
13
//
14
// The TXM platform is distributed in the hope that it will be
15
// useful, but WITHOUT ANY WARRANTY; without even the implied
16
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17
// PURPOSE. See the GNU General Public License for more
18
// details.
19
//
20
// You should have received a copy of the GNU General
21
// Public License along with the TXM platform. If not, see
22
// http://www.gnu.org/licenses.
23
//
24
//
25
//
26
// $LastChangedDate: 2016-03-29 09:51:35 +0200 (mar. 29 mars 2016) $
27
// $LastChangedRevision: 3185 $
28
// $LastChangedBy: mdecorde $
29
//
30
package org.txm.importer.scripts.xmltxm
31

    
32
import javax.xml.stream.*
33

    
34
import org.txm.importer.StaxIdentityParser
35
import org.txm.importer.filters.*
36
import org.txm.importer.PersonalNamespaceContext
37
import java.io.File
38
import java.io.IOException
39

    
40
import javax.xml.stream.XMLStreamException
41

    
42
/**
 * Injects the annotations of a stand-off file into an xml-tei-txm file.
 *
 * @author mdecorde
 */
50

    
51
public class AnnotationInjection extends StaxIdentityParser {
52

    
53
        /** Namespace URI used for the txm:ana elements written by this class. */
        public static String TXMNS = "http://textometrie.org/1.0"

        /** Factory created in the constructor; it is not read anywhere else in this class. */
        private def factory;

        /**
         * Annotation types written for every word: the link group identifiers of
         * the stand-off file, extended in replace mode with the types of the ana
         * elements captured from the document.
         */
        private LinkedHashSet<String> links;

        /** When true, ana elements found inside a word are captured and rewritten instead of being copied. */
        boolean replace

        /** One stand-off reader per link group, keyed by the group identifier. */
        private LinkedHashMap<String, XMLStreamReader> linkparsers;

        /** Location of the stand-off annotation file. */
        private def anaurl;

        /** Input stream most recently opened on the stand-off file. */
        private def anainputData;

        /** Factory that creates every reader opened on the stand-off file. */
        private XMLInputFactory anafactory = XMLInputFactory.newInstance();

        /** Reader currently in use on the stand-off file. */
        private XMLStreamReader anaparser;

        /** Reader used while copying header fragments out of the stand-off file. */
        private XMLStreamReader headerparser;

        /** First attribute value of the respStmt element of the stand-off file. */
        String respStmtID = "";

        /** First attribute value of every taxonomy element met in the processed document. */
        ArrayList<String> presentTaxonomies = new ArrayList<String>();
83

    
84
        /**
         * Creates an injector that leaves replace mode off.
         * Delegates to the three-argument constructor with {@code replace} set to false.
         *
         * @param url the xml-tei-txm file to read
         * @param anaurl the stand-off annotation file
         */
        public AnnotationInjection(URL url, URL anaurl) {
                this(url, anaurl, false);
        }
93

    
94
        /**
         * Creates an injector and opens one stand-off reader per link group.
         *
         * Parsing and I/O failures raised while the link readers are built are
         * reported on standard output and do not propagate to the caller.
         *
         * @param url the xml-tei-txm file to read
         * @param anaurl the stand-off annotation file
         * @param replace true to capture the ana elements already present in a
         *        word and rewrite them together with the stand-off values
         */
        public AnnotationInjection(URL url, URL anaurl, boolean replace) {
                super(url); // init reader and writer
                try {
                        this.anaurl = anaurl;
                        this.replace = replace
                        factory = XMLInputFactory.newInstance();
                        this.buildLinkParsers(); // build a parser per linkgroup
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                } catch (IOException ex) {
                        // name the file and the cause: the bare message gave no way to diagnose the failure
                        System.out.println("IOException while parsing " + anaurl + ": " + ex);
                }
        }
113

    
114
        /**
         * Copies a fragment of the stand-off file to the output writer.
         *
         * A fresh stream and reader are opened on the stand-off file for each
         * call and are always closed, even when copying fails.
         *
         * @param containertag local name of the element delimiting the fragment
         * @param captureTheTag true to copy the delimiting element itself along
         *        with its content, false to copy only what it contains
         */
        private void getHeaderInfos(String containertag, boolean captureTheTag)
        {
                anainputData = new BufferedInputStream(anaurl.openStream());
                try {
                        headerparser = anafactory.createXMLStreamReader(anainputData);
                        try {
                                copyHeaderFragment(containertag, captureTheTag);
                        } finally {
                                headerparser.close();
                        }
                } finally {
                        // closing the reader does not close the stream it reads from
                        anainputData.close();
                }
        }

        /**
         * Reads headerparser from its current position and writes the events of
         * the requested fragment: start and end elements, characters and comments.
         * Attributes are written by local name only; their prefixes are not reproduced.
         */
        private void copyHeaderFragment(String containertag, boolean captureTheTag)
        {
                boolean start = false; // true while events are being copied
                for (int event = headerparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = headerparser.next()) {
                        if (event == XMLStreamConstants.START_ELEMENT) {
                                String tag = headerparser.getLocalName();
                                if (captureTheTag && tag.equals(containertag)) {
                                        start = true; // the container itself is part of the copy
                                }
                                if (start) {
                                        String prefix = headerparser.getPrefix();
                                        // getPrefix() may return null when the element has no prefix
                                        if (prefix != null && prefix.length() > 0) {
                                                writer.writeStartElement(Nscontext.getNamespaceURI(prefix), tag)
                                        } else {
                                                writer.writeStartElement(tag);
                                        }
                                        for (int i = 0 ; i < headerparser.getNamespaceCount(); i++) {
                                                writer.writeNamespace(headerparser.getNamespacePrefix(i), headerparser.getNamespaceURI(i));
                                        }
                                        for (int i = 0 ; i < headerparser.getAttributeCount(); i++) {
                                                writer.writeAttribute(headerparser.getAttributeLocalName(i), headerparser.getAttributeValue(i));
                                        }
                                }
                                if (!captureTheTag && tag.equals(containertag)) {
                                        start = true; // copying begins after the container start tag
                                }
                        } else if (event == XMLStreamConstants.END_ELEMENT) {
                                String tag = headerparser.getLocalName();
                                if (!captureTheTag && tag.equals(containertag)) {
                                        break; // container end reached, its end tag is not copied
                                }
                                if (start) {
                                        writer.writeEndElement();
                                }
                                if (captureTheTag && tag.equals(containertag)) {
                                        break; // container end tag has just been copied
                                }
                        } else if (event == XMLStreamConstants.CHARACTERS) {
                                if (start) {
                                        writer.writeCharacters(headerparser.getText());
                                }
                        } else if (event == XMLStreamConstants.COMMENT) {
                                if (start) {
                                        writer.writeComment(headerparser.getText());
                                }
                        }
                }
        }
161

    
162
        /**
         * Scans the whole stand-off file once and collects its link groups.
         *
         * The first attribute value of every linkGrp element is taken as the
         * group identifier (presumably its type; confirm against the stand-off
         * format). The first attribute value of the respStmt element is stored
         * in respStmtID. The stream and the reader are closed even when the
         * scan fails.
         *
         * @return the group identifiers in document order, without duplicates
         */
        private LinkedHashSet<String> findGrpLink()
        {
                LinkedHashSet<String> groups = new LinkedHashSet<String>();
                anainputData = new BufferedInputStream(anaurl.openStream());
                try {
                        anaparser = anafactory.createXMLStreamReader(anainputData);
                        try {
                                for (int event = anaparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = anaparser.next()) {
                                        if (event != XMLStreamConstants.START_ELEMENT) {
                                                continue;
                                        }
                                        String tag = anaparser.getLocalName();
                                        if (tag.equals("linkGrp")) {
                                                String targetsvalue = anaparser.getAttributeValue(0)
                                                // add() returns false when the identifier is already known
                                                if (!groups.add(targetsvalue)) {
                                                        System.err.println("Warning: Multiple group declaration : "+targetsvalue+" has already been added, the first one will be used")
                                                }
                                        } else if (tag.equals("respStmt")) {
                                                respStmtID = anaparser.getAttributeValue(0); // one attribute (id) only
                                        }
                                }
                        } finally {
                                anaparser.close();
                        }
                } finally {
                        // closing the reader does not close the stream it reads from
                        anainputData.close();
                }
                return groups;
        }
192

    
193
        /**
         * Opens one reader per link group of the stand-off file and advances
         * each of them to the start tag of its own linkGrp element, so that the
         * next link element read belongs to that group.
         *
         * NOTE(review): anainputData is overwritten for every group, so only the
         * last opened stream stays reachable from this class.
         */
        private void buildLinkParsers() {

                // identifiers of the link groups declared in the stand-off file
                links = findGrpLink();
                linkparsers = new LinkedHashMap<String, XMLStreamReader>();

                // first pass: one independent reader per group
                for (String group : links) {
                        anainputData = new BufferedInputStream(anaurl.openStream());
                        XMLStreamReader reader = anafactory.createXMLStreamReader(anainputData);
                        linkparsers.put(group, reader);
                }

                // second pass: move every reader onto its own linkGrp start tag
                for (String group : links) {
                        anaparser = linkparsers.get(group);
                        while (true) {
                                int event = anaparser.next();
                                if (event == XMLStreamConstants.END_DOCUMENT) {
                                        break;
                                }
                                if (event != XMLStreamConstants.START_ELEMENT) {
                                        continue;
                                }
                                if (!anaparser.getLocalName().equals("linkGrp")) {
                                        continue;
                                }
                                String groupId = anaparser.getAttributeValue(0)
                                if (groupId.equals(group)) {
                                        break; // next element is a link start tag
                                }
                        }
                }
        }
224
        
225
        /**
         * Runs the inherited processing on the document and then releases the
         * stand-off link readers, whether or not the processing succeeded.
         *
         * @param outfile the file that receives the result
         * @return the value returned by the inherited process method
         * @throws XMLStreamException if the inherited processing raises it
         * @throws IOException if the inherited processing raises it
         */
        public boolean process(File outfile) throws XMLStreamException, IOException
        {
                try {
                        // the result was previously stored and never returned
                        return super.process(outfile);
                } finally {
                        releaseLinkParsers();
                }
        }
230
        
231
        /**
         * Closes every reader stored in linkparsers. Does nothing when the map
         * was never built; a failure to close one reader is printed and the
         * remaining readers are still closed.
         */
        private void releaseLinkParsers() {
                if (linkparsers == null) return;

                for (Map.Entry<String, XMLStreamReader> entry : linkparsers.entrySet()) {
                        String l = entry.getKey();
                        XMLStreamReader p = entry.getValue();
                        if (p == null) {
                                continue;
                        }
                        try {
                                p.close();
                        } catch(Exception e) {
                                println "** Can not close $l link parser $p: $e"
                        }
                }
        }
248

    
249
        /**
         * Reads the reader of the given link group up to its next link element
         * and returns the annotation value carried by that element.
         *
         * The first attribute of the link element is expected to look like
         * "#first #second"; the part after the second '#' is the value. A link
         * whose attribute does not match is reported on standard error and the
         * scan moves on to the following link element.
         *
         * NOTE(review): the scan only stops at the end of the document, not at
         * the end of the current linkGrp, and wordId is not compared with the
         * first part of the target.
         *
         * @param link identifier of the link group to read from
         * @param wordId identifier of the current word (not used by this method)
         * @return the next annotation value with "&lt;" replaced by "<", or an
         *         empty string once the end of the document is reached
         */
        private String getNextAnaValue(String link, String wordId) {
                anaparser = linkparsers.get(link);
                while (true) {
                        int event = anaparser.next();
                        if (event == XMLStreamConstants.END_DOCUMENT) {
                                return "";
                        }
                        boolean onLink = event == XMLStreamConstants.START_ELEMENT && anaparser.getLocalName().equals("link");
                        if (!onLink) {
                                continue;
                        }
                        String target = anaparser.getAttributeValue(0)
                        def matcher = (target =~ /#(.*) #(.*)/)
                        if (matcher.find()) { // external tag
                                String value = matcher.group(2);
                                return value.replace("&lt;", "<");
                        }
                        System.err.println("Error: getNextAna(): link target is not well formed:  = "+anaparser.getAttributeValue(0));
                }
        }
277

    
278
        /**
         * Writes one txm:ana element per known annotation type for the current word.
         *
         * Each element carries resp="#" + respStmtID and type="#" + the type.
         * Its text comes from the stand-off reader of that type when one exists;
         * otherwise, in replace mode, from the value captured in the document
         * for this word.
         *
         * @param wordId the identifier of the word being closed
         */
        private void writeAnaTags(String wordId)
        {
                for (String link : links) {
                        writer.writeStartElement(TXMNS, "ana");
                        writer.writeAttribute("resp", "#"+respStmtID);
                        writer.writeAttribute("type", "#"+link);

                        String value;
                        if (!replace || linkparsers.containsKey(link)) { // stand-off values take precedence
                                value = getNextAnaValue(link, wordId);
                        } else {
                                // type only known from ana elements captured in the document;
                                // this word may not carry it, in which case the map returns null
                                value = anaValues.get(link);
                        }
                        // never hand null to the writer: how it is handled depends on the implementation
                        writer.writeCharacters(value == null ? "" : value);

                        writer.writeEndElement(); // txm:ana
                }
        }
304

    
305
        /** Value of the id attribute of the last w element that carried one. */
        String wordId;

        /** Values of the ana elements captured for the current word, keyed by type. */
        HashMap<String, String> anaValues = new HashMap<String, String>();

        /** NOTE(review): only ever reset to false in this class; check whether something else sets it. */
        boolean flagSourceDesc = false;

        /** True between the start and the end of a w element. */
        boolean flagW = false;

        /** True while the content of a captured ana element is being read. */
        boolean flagAna = false;

        /** type attribute of the ana element being captured, without its first character. */
        String type = null;

        /** resp attribute of the ana element being captured, without its first character. */
        String resp = null;

        /** Text accumulated for the ana element being captured. */
        String anaValue = "";

        /**
         * Handles a start element of the document.
         *
         * taxonomy: records its first attribute value, then copies the element.
         * ana inside a word, in replace mode: starts capturing it and does not
         * copy its start tag. w: remembers its id, clears the captured values,
         * then copies the element. Everything else is passed to the inherited handler.
         */
        protected void processStartElement() {

                if (localname.equals("taxonomy")) {
                        presentTaxonomies.add(parser.getAttributeValue(0)); // taxonomy type
                } else if (localname.equals("ana") && replace && flagW) {
                        flagAna = true
                        anaValue = ""
                        type = null
                        resp = null
                        int attributeCount = parser.getAttributeCount()
                        for (int idx = 0 ; idx < attributeCount ; idx++) {
                                switch (parser.getAttributeLocalName(idx)) {
                                        case "resp":
                                                resp = parser.getAttributeValue(idx);
                                                break;
                                        case "type":
                                                type = parser.getAttributeValue(idx);
                                                break;
                                }
                        }
                        // drop the first character of both references
                        if (type != null) type = type.substring(1);
                        if (resp != null) resp = resp.substring(1);
                        return; // don't write the "ana" start element
                } else if (localname.equals("w")) {
                        int attributeCount = parser.getAttributeCount()
                        for (int idx = 0 ; idx < attributeCount ; idx++) {
                                if ("id".equals(parser.getAttributeLocalName(idx))) {
                                        wordId = parser.getAttributeValue(idx);
                                        break
                                }
                        }
                        flagW = true
                        anaValues.clear()
                }

                super.processStartElement();
        }
342

    
343
        /**
         * Handles a text event: while an ana element is being captured the text
         * is appended to anaValue, otherwise it goes to the inherited handler.
         *
         * NOTE(review): the original author marked the pass-through branch with
         * "FORM CONTENT LOST"; the reason is not visible in this file.
         */
        protected void processCharacters() {
                if (!flagAna) {
                        super.processCharacters();
                        return;
                }
                anaValue += parser.getText();
        }
347

    
348
        /** True once the end of an appInfo element has been handled. */
        boolean applicationWritten = false;

        /** True once the end of a classDecl element has been handled. */
        boolean taxonomiesWritten = false;

        /**
         * Handles an end element of the document.
         *
         * w: writes the txm:ana elements of the word before it is closed.
         * ana being captured: stores its value and writes nothing, because its
         * start tag was not written either. appInfo / classDecl: appends the
         * content of the same element of the stand-off file before closing.
         * encodingDesc: adds an appInfo and/or classDecl element filled from the
         * stand-off file when the document had none. titleStmt: copies the
         * respStmt element of the stand-off file when flagSourceDesc is set.
         * Unless it returned early, the end element is then passed to the
         * inherited handler.
         */
        protected void processEndElement() {
                switch (parser.getLocalName()) {
                        case "w":
                                writeAnaTags(wordId);
                                flagW = false
                                break;
                        case "ana":
                                if (flagAna) {
                                        // flagAna means the start tag was suppressed, so the end tag must
                                        // always be suppressed too or the writer closes the wrong element
                                        if (type != null && resp != null && anaValue != null) {
                                                anaValues.put(type, anaValue)
                                                links.add(type)
                                        } else {
                                                System.err.println("Warning: ana element without type or resp attribute dropped in word "+wordId)
                                        }
                                        flagAna = false
                                        return; // don't write the "ana" end element
                                }
                                break;

                        case "appInfo":
                                applicationWritten = true;
                                getHeaderInfos("appInfo", false);
                                break;

                        case "classDecl":
                                taxonomiesWritten = true;
                                getHeaderInfos("classDecl", false);
                                break;

                        case "encodingDesc":
                                if (!applicationWritten) {
                                        writer.writeStartElement("appInfo");
                                        getHeaderInfos("appInfo", false);
                                        writer.writeEndElement(); // appInfo
                                }
                                if (!taxonomiesWritten) {
                                        writer.writeStartElement("classDecl");
                                        getHeaderInfos("classDecl", false);
                                        writer.writeEndElement(); // classDecl
                                }
                                break;

                        case "titleStmt":
                                if (flagSourceDesc) {
                                        getHeaderInfos("respStmt", true);
                                        flagSourceDesc = false;
                                }
                                break;
                }
                super.processEndElement();
        }
400

    
401
        /** True once declareNamespace() has written the namespace declarations. */
        boolean declarenamespace = false;

        /**
         * Writes the TEI default namespace and the txm prefix declaration on the
         * writer, the first time it is called only.
         *
         * @return true on the call that writes the declarations, null on later calls
         */
        private declareNamespace() {
                if (declarenamespace) {
                        return null;
                }
                writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0");
                writer.writeNamespace("txm", TXMNS);
                return (declarenamespace = true);
        }
416

    
417
        /**
         * Manual test entry point: injects two stand-off files in sequence into
         * a sample document stored under xml/rgaqcj in the user's home directory.
         *
         * The first pass reads anainline/roland.xml and writes
         * injection/roland.xml; the second pass reads that result and writes
         * injection/roland-FINAL.xml.
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args) {

                // "~" is a shell shortcut that java.io.File does not expand
                File rootDir = new File(System.getProperty("user.home"), "xml/rgaqcj");
                File injectionDir = new File(rootDir, "injection");
                injectionDir.mkdirs();

                File srcfile = new File(new File(rootDir, "anainline"), "roland.xml");
                File pos1file = new File(new File(rootDir, "pos"), "rolandTT1-w-ana.xml");

                File src2file = new File(injectionDir, "roland.xml");
                File pos2file = new File(new File(rootDir, "pos"), "rolandTT2-w-ana.xml");

                println("process file : " + srcfile + " with : " + pos1file);
                def builder = new AnnotationInjection(srcfile.toURI().toURL(), pos1file.toURI().toURL());
                builder.process(src2file);

                println("process file : " + src2file + " with : " + pos2file);
                builder = new AnnotationInjection(src2file.toURI().toURL(), pos2file.toURI().toURL());
                builder.process(new File(injectionDir, "roland-FINAL.xml"));
        }
447
}