Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / importer / scripts / xmltxm / AnnotationInjection.groovy @ 1681

History | View | Annotate | Download (12.7 kB)

1

    
2

    
3
// Copyright © 2010-2013 ENS de Lyon.
4
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
5
// Lyon 2, University of Franche-Comté, University of Nice
6
// Sophia Antipolis, University of Paris 3.
7
//
8
// The TXM platform is free software: you can redistribute it
9
// and/or modify it under the terms of the GNU General Public
10
// License as published by the Free Software Foundation,
11
// either version 2 of the License, or (at your option) any
12
// later version.
13
//
14
// The TXM platform is distributed in the hope that it will be
15
// useful, but WITHOUT ANY WARRANTY; without even the implied
16
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17
// PURPOSE. See the GNU General Public License for more
18
// details.
19
//
20
// You should have received a copy of the GNU General
21
// Public License along with the TXM platform. If not, see
22
// http://www.gnu.org/licenses.
23
//
24
//
25
//
26
// $LastChangedDate: 2016-03-29 09:51:35 +0200 (mar. 29 mars 2016) $
27
// $LastChangedRevision: 3185 $
28
// $LastChangedBy: mdecorde $
29
//
30
package org.txm.importer.scripts.xmltxm
31

    
32
import javax.xml.stream.*
33

    
34
import org.txm.importer.StaxIdentityParser
35
import org.txm.importer.filters.*
36
import org.txm.importer.PersonalNamespaceContext
37
import java.io.File
38
import java.io.IOException
39

    
40
import javax.xml.stream.XMLStreamException
41

    
42
/**
43
 * The Class AnnotationInjection.
44
 *
45
 * @author mdecorde
46
 * 
47
 * inject annotation from a stand-off file into a xml-tei-txm
48
 * file
49
 */
50

    
51
public class AnnotationInjection extends StaxIdentityParser {
52

    
53
        /** Namespace URI used for the txm:ana elements written by this class. */
        public static String TXMNS = "http://textometrie.org/1.0"

        /** XML input factory created by the constructor. */
        private def factory;

        /** Link group identifiers, in the order they are written for each word. */
        private LinkedHashSet<String> links;

        /** When true, the ana children already present in a word are captured and re-emitted. */
        boolean replace

        /** One stand-off reader per link group; key = link group identifier. */
        private LinkedHashMap<String, XMLStreamReader> linkparsers;

        /** URL of the stand-off file. */
        private def anaurl;

        /** Input stream most recently opened on the stand-off file. */
        private def anainputData;

        /** Factory creating every reader opened on the stand-off file. */
        private XMLInputFactory anafactory = XMLInputFactory.newInstance();

        /** Stand-off reader currently in use. */
        private XMLStreamReader anaparser;

        /** Reader used to copy header parts of the stand-off file. */
        private XMLStreamReader headerparser;

        /** Value of the first attribute of the stand-off respStmt element; written as the "resp" of each ana. */
        String respStmtID = "";

        /** First attribute value of each taxonomy element met in the source file. */
        ArrayList<String> presentTaxonomies = new ArrayList();
83

    
84
        /**
         * Creates an injector with the replace mode switched off.
         *
         * @param url the xml-tei-txm file
         * @param anaurl the stand-off file
         */
        public AnnotationInjection(URL url, URL anaurl) {
                this(url, anaurl, false);
        }
93

    
94
        /**
         * Creates an injector and prepares one stand-off reader per link group.
         *
         * Errors raised while reading the stand-off file are reported on the
         * standard output and do not abort the construction.
         *
         * @param url the xml-tei-txm file
         * @param anaurl the stand-off file
         * @param replace if true, ana elements already present in the words are
         *        captured and re-emitted instead of being copied as is
         */
        public AnnotationInjection(URL url, URL anaurl, boolean replace) {
                super(url); // init reader and writer
                this.anaurl = anaurl;
                this.replace = replace;
                factory = XMLInputFactory.newInstance();
                try {
                        this.buildLinkParsers(); // build a parser per linkgroup
                } catch (XMLStreamException ex) {
                        System.out.println(ex);
                } catch (IOException ex) {
                        // report which file failed and why instead of a truncated message
                        System.out.println("IOException while parsing " + anaurl + ": " + ex);
                }
        }
113

    
114
        /**
         * Copies a part of the stand-off file into the output writer.
         *
         * The stand-off file is read from its beginning. Start tags, end tags,
         * text and comments are copied once the container tag has been reached,
         * and the copy stops at the end tag of the container.
         *
         * @param containertag local name of the element delimiting the copy
         * @param captureTheTag if true the container element itself is copied,
         *        otherwise only its content is copied
         */
        private void getHeaderInfos(String containertag, boolean captureTheTag)
        {
                InputStream input = new BufferedInputStream(anaurl.openStream());
                anainputData = input;
                try {
                        headerparser = anafactory.createXMLStreamReader(input);
                        try {
                                boolean start = false;
                                String localname;
                                for (int event = headerparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = headerparser.next())
                                {
                                        if (event == XMLStreamConstants.START_ELEMENT) {
                                                localname = headerparser.getLocalName();
                                                if (captureTheTag && localname == containertag) // start copy with the tag
                                                        start = true;
                                                if (start) { // copy header
                                                        // getPrefix() may return null when the element has no prefix
                                                        String prefix = headerparser.getPrefix();
                                                        if (prefix != null && prefix.length() > 0)
                                                                writer.writeStartElement(Nscontext.getNamespaceURI(prefix), localname)
                                                        else
                                                                writer.writeStartElement(localname);
                                                        for (int i = 0 ; i < headerparser.getNamespaceCount(); i++)
                                                                writer.writeNamespace(headerparser.getNamespacePrefix(i), headerparser.getNamespaceURI(i));
                                                        // attributes are written with their local name only
                                                        for (int i = 0 ; i < headerparser.getAttributeCount(); i++)
                                                                writer.writeAttribute(headerparser.getAttributeLocalName(i), headerparser.getAttributeValue(i));
                                                }
                                                if (!captureTheTag && localname == containertag) // start copy after the tag
                                                        start = true;
                                        } else if (event == XMLStreamConstants.END_ELEMENT) {
                                                localname = headerparser.getLocalName();
                                                if (!captureTheTag && localname == containertag)
                                                        break; // container closed: its end tag is not copied

                                                if (start)
                                                        writer.writeEndElement();

                                                if (captureTheTag && localname == containertag)
                                                        break; // container closed: its end tag has been copied
                                        } else if (event == XMLStreamConstants.CHARACTERS) {
                                                if (start)
                                                        writer.writeCharacters(headerparser.getText());
                                        } else if (event == XMLStreamConstants.COMMENT) {
                                                if (start)
                                                        writer.writeComment(headerparser.getText());
                                        }
                                }
                        } finally {
                                headerparser.close();
                        }
                } finally {
                        // XMLStreamReader.close() does not close the underlying stream
                        input.close();
                }
        }
160

    
161
        /**
         * Scans the whole stand-off file to list its link groups.
         *
         * The first attribute value of each linkGrp element is collected; a value
         * met twice is reported on the error output and kept only once. The first
         * attribute value of each respStmt element is stored in respStmtID.
         *
         * @return the link group identifiers, in document order
         */
        private LinkedHashSet<String> findGrpLink()
        {
                LinkedHashSet<String> links = new LinkedHashSet<String>();
                InputStream input = anaurl.openStream();
                anainputData = input;
                try {
                        anaparser = anafactory.createXMLStreamReader(input);
                        try {
                                for (int event = anaparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = anaparser.next()) {
                                        if (event != XMLStreamConstants.START_ELEMENT) {
                                                continue;
                                        }
                                        if (anaparser.getLocalName().equals("linkGrp")) {
                                                String targetsvalue = anaparser.getAttributeValue(0)
                                                // add() returns false when the group was already declared
                                                if (!links.add(targetsvalue)) {
                                                        System.err.println("Warning: Multiple group declaration : "+targetsvalue+" has already been added, the first one will be used")
                                                }
                                        } else if (anaparser.getLocalName().equals("respStmt")) {
                                                respStmtID = anaparser.getAttributeValue(0); // one attribute (id) only
                                        }
                                }
                        } finally {
                                anaparser.close();
                        }
                } finally {
                        // XMLStreamReader.close() does not close the underlying stream
                        input.close();
                }
                return links;
        }
190

    
191
        /**
         * Prepares the stand-off readers.
         *
         * The link groups are listed first, then one reader per group is opened on
         * the stand-off file and moved forward to the linkGrp start tag whose first
         * attribute equals the group identifier.
         */
        private void buildLinkParsers() {

                // link groups declared in the stand-off file
                links = findGrpLink();
                linkparsers = new LinkedHashMap<String, XMLStreamReader>();

                // first pass: open one reader per link group
                Iterator<String> groups = links.iterator();
                while (groups.hasNext()) {
                        String group = groups.next();
                        anainputData = new BufferedInputStream(anaurl.openStream());
                        linkparsers.put(group, anafactory.createXMLStreamReader(anainputData));
                }

                // second pass: position each reader on its own group
                for (Map.Entry<String, XMLStreamReader> entry : linkparsers.entrySet()) {
                        String group = entry.getKey();
                        anaparser = entry.getValue();
                        while (true) {
                                int event = anaparser.next();
                                if (event == XMLStreamConstants.END_DOCUMENT) {
                                        break; // group not found
                                }
                                if (event != XMLStreamConstants.START_ELEMENT) {
                                        continue;
                                }
                                if (!anaparser.getLocalName().equals("linkGrp")) {
                                        continue;
                                }
                                if (anaparser.getAttributeValue(0).equals(group)) {
                                        break; // next element is a link start tag
                                }
                        }
                }
        }
222
        
223
        /**
         * Runs the injection and writes the result in the given file, then
         * releases the stand-off readers, even if the processing fails.
         *
         * @param outfile the result file
         * @return the value returned by the parent implementation
         * @throws XMLStreamException declared for stream errors raised while processing
         * @throws IOException declared for I/O errors raised while processing
         */
        public boolean process(File outfile) throws XMLStreamException, IOException
        {
                boolean ret = false;
                try {
                        ret = super.process(outfile);
                } finally {
                        releaseLinkParsers();
                }
                // without this explicit return the method always returned false
                return ret;
        }
228
        
229
        /**
         * Closes every stand-off reader stored in linkparsers.
         * A reader that cannot be closed is reported and the others are still closed.
         */
        private void releaseLinkParsers() {
                if (linkparsers == null) {
                        return;
                }

                for (Map.Entry<String, XMLStreamReader> entry : linkparsers.entrySet()) {
                        String l = entry.getKey();
                        XMLStreamReader p = entry.getValue();
                        if (p == null) {
                                continue;
                        }
                        try {
                                p.close();
                        } catch(Exception e) {
                                println "** Can not close $l link parser $p: $e"
                        }
                }
        }
246

    
247
        /**
         * Reads the next link element of a link group and returns its annotation value.
         *
         * The first attribute of the link is expected to look like "#xxx #value";
         * the part after the second '#' is returned, with "&lt;" replaced by "<".
         * A link which does not match is reported on the error output and skipped.
         *
         * @param link the link group identifier, used to select the reader
         * @param wordId the word id (not used by the current implementation)
         * @return the annotation value, or "" if the end of the document is reached
         */
        private String getNextAnaValue(String link, String wordId) {
                anaparser = linkparsers.get(link);
                while (true) {
                        int event = anaparser.next();
                        if (event == XMLStreamConstants.END_DOCUMENT) {
                                return "";
                        }
                        if (event != XMLStreamConstants.START_ELEMENT || !anaparser.getLocalName().equals("link")) {
                                continue;
                        }

                        String target = anaparser.getAttributeValue(0)
                        def matcher = (target =~ /#(.*) #(.*)/)
                        if (matcher.find()) { // external tag
                                String anavalue = matcher.group(2);
                                return anavalue.replace("&lt;", "<");
                        }
                        System.err.println("Error: getNextAna(): link target is not well formed:  = "+anaparser.getAttributeValue(0));
                }
        }
275

    
276
        /**
         * Writes one txm:ana element per known link group for the current word.
         *
         * The value comes from the stand-off reader of the group when there is
         * one; in replace mode, a group without reader gets the value captured
         * from the word itself, or an empty content if the word had none.
         *
         * @param wordId the word id
         */
        private void writeAnaTags(String wordId)
        {
                for (String link : links) {
                        writer.writeStartElement(TXMNS, "ana");
                        writer.writeAttribute("resp", "#"+respStmtID);
                        writer.writeAttribute("type", "#"+link);

                        String value;
                        if (!replace || linkparsers.containsKey(link)) { // order is important
                                value = getNextAnaValue(link, wordId);
                        } else {
                                // anaValues is cleared for each word: the type may be missing
                                value = anaValues.get(link);
                        }
                        writer.writeCharacters(value != null ? value : "");

                        writer.writeEndElement(); // txm:ana
                }
        }
302

    
303
        /** Value of the "id" attribute of the last w element carrying one. */
        String wordId;
        /** Replace mode: values of the ana elements captured in the current word; key = type. */
        HashMap<String, String> anaValues = new HashMap<String, String>();
        /** Tested when a titleStmt element ends. */
        boolean flagSourceDesc = false;
        /** True while the parser is inside a w element. */
        boolean flagW = false;
        /** True while the parser is inside a captured ana element. */
        boolean flagAna = false;
        /** "type" attribute of the captured ana element, without its first character. */
        String type = null;
        /** "resp" attribute of the captured ana element, without its first character. */
        String resp = null;
        /** Text content of the captured ana element. */
        String anaValue = "";

        /**
         * Handles a start tag of the source file.
         *
         * The first attribute of a taxonomy element is recorded. In replace mode,
         * an ana element found inside a word is captured and its start tag is not
         * written. A w element updates the current word id and resets the
         * captured values. Every other start tag is delegated to the parent.
         */
        protected void processStartElement() {

                if (localname.equals("taxonomy")) {
                        // taxonomy type
                        presentTaxonomies.add(parser.getAttributeValue(0));
                } else if (localname.equals("ana") && flagW && replace) {
                        flagAna = true
                        anaValue = ""
                        type = null
                        resp = null
                        int attributeCount = parser.getAttributeCount();
                        for (int idx = 0 ; idx < attributeCount ; idx++) {
                                switch (parser.getAttributeLocalName(idx)) {
                                        case "resp":
                                                resp = parser.getAttributeValue(idx);
                                                break;
                                        case "type":
                                                type = parser.getAttributeValue(idx);
                                                break;
                                }
                        }
                        // drop the first character of the references
                        type = type?.substring(1);
                        resp = resp?.substring(1);
                        return; // don't write the "ana" start element
                } else if (localname.equals("w")) {
                        int attributeCount = parser.getAttributeCount();
                        int idx = 0;
                        while (idx < attributeCount) {
                                if (parser.getAttributeLocalName(idx) == "id") {
                                        wordId = parser.getAttributeValue(idx);
                                        break
                                }
                                idx++;
                        }
                        flagW = true
                        anaValues.clear()
                }

                super.processStartElement();
        }
340

    
341
        /**
         * Handles a text event: the text is appended to the captured ana value
         * while an ana element is being captured, otherwise it is delegated to
         * the parent implementation.
         */
        protected void processCharacters() {
                if (!flagAna) {
                        // NOTE(review): the original comment here warned "FORM CONTENT LOST" - to be confirmed
                        super.processCharacters();
                        return;
                }
                anaValue = anaValue + parser.getText();
        }
345

    
346
        /** True once the end of an appInfo element has been processed. */
        boolean applicationWritten = false;
        /** True once the end of a classDecl element has been processed. */
        boolean taxonomiesWritten = false;

        /**
         * Handles an end tag of the source file.
         *
         * At the end of a word the txm:ana elements are written. The end tag of
         * a captured ana element is not written, as its start tag was not
         * written either. The stand-off appInfo and classDecl contents are
         * appended to the matching elements of the source file, or to new
         * elements created at the end of encodingDesc if the source file has
         * none. Then the end tag is delegated to the parent implementation.
         */
        protected void processEndElement() {
                switch (parser.getLocalName()) {
                        case "w":
                                writeAnaTags(wordId);
                                flagW = false
                                break;
                        case "ana":
                                if (flagAna) {
                                        // the value is kept only if the element is fully described
                                        if (replace && type != null && resp != null && anaValue != null) {
                                                anaValues.put(type, anaValue)
                                                links.add(type)
                                        }
                                        flagAna = false
                                        // the start tag was not written: writing the end tag
                                        // would close the enclosing element instead
                                        return;
                                }
                                break;

                        case "appInfo":
                                applicationWritten = true;
                                getHeaderInfos("appInfo", false);
                                break;

                        case "classDecl":
                                taxonomiesWritten = true;
                                getHeaderInfos("classDecl", false);
                                break;

                        case "encodingDesc":
                                if (!applicationWritten) {
                                        writer.writeStartElement("appInfo");
                                        getHeaderInfos("appInfo", false);
                                        writer.writeEndElement(); // appInfo
                                }
                                if (!taxonomiesWritten) {
                                        writer.writeStartElement("classDecl");
                                        getHeaderInfos("classDecl", false);
                                        writer.writeEndElement(); // classDecl
                                }
                                break;

                        case "titleStmt":
                                // NOTE(review): flagSourceDesc is never set to true in this class - confirm this branch is still needed
                                if (flagSourceDesc) {
                                        getHeaderInfos("respStmt", true);
                                        flagSourceDesc = false;
                                }
                                break;
                }
                super.processEndElement();
        }
398

    
399
        /** True once declareNamespace() has written the namespace declarations. */
        boolean declarenamespace = false;

        /**
         * Writes the TEI default namespace and the "txm" namespace declarations,
         * the first time it is called only.
         *
         * @return true if the declarations were written by this call, null otherwise
         */
        private declareNamespace() {
                if (declarenamespace) {
                        return null;
                }
                writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0");
                writer.writeNamespace("txm", TXMNS);
                declarenamespace = true;
                return true;
        }
414

    
415
        /**
         * Development entry point: injects two stand-off files one after the
         * other into a sample text stored in the "xml/rgaqcj" directory of the
         * user home.
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args) {

                // "~" is not expanded by java.io.File: resolve the user home explicitly
                File rootDir = new File(System.getProperty("user.home"), "xml/rgaqcj");
                File injectionDir = new File(rootDir, "injection");
                injectionDir.mkdir();

                File srcfile = new File(new File(rootDir, "anainline"), "roland.xml");
                File pos1file = new File(new File(rootDir, "pos"), "rolandTT1-w-ana.xml");

                File src2file = new File(injectionDir, "roland.xml");
                File pos2file = new File(new File(rootDir, "pos"), "rolandTT2-w-ana.xml");
                File finalfile = new File(injectionDir, "roland-FINAL.xml");

                // first pass: the result is the source of the second pass
                println("process file : " + srcfile + " with : " + pos1file);
                def builder = new AnnotationInjection(srcfile.toURI().toURL(),
                                pos1file.toURI().toURL());
                builder.process(src2file);

                // second pass
                println("process file : " + src2file + " with : " + pos2file);
                builder = new AnnotationInjection(src2file.toURI().toURL(),
                                pos2file.toURI().toURL());
                builder.process(finalfile);

                return;
        }
445
}