Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / teitxm / AnnotationInjection.groovy @ 479

History | View | Annotate | Download (12.1 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-03-29 09:51:35 +0200 (mar. 29 mars 2016) $
25
// $LastChangedRevision: 3185 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.teitxm;
29
import javax.xml.stream.*
30

    
31
import org.txm.importer.StaxIdentityParser
32
import org.txm.importer.filters.*
33
import org.txm.importer.graal.PersonalNamespaceContext
34
// TODO: Auto-generated Javadoc
35
/**
 * Injects the annotations of a stand-off file into an xml-tei-txm file.
 *
 * @author mdecorde
 */
43

    
44
public class AnnotationInjection extends StaxIdentityParser {

    /** Namespace URI used for the "ana" elements written by this class. */
    public static String TXMNS = "http://textometrie.org/1.0"

    /**
     * Shape of a stand-off link target: two "#"-prefixed references separated by a space.
     * The second group is used as the annotation value.
     */
    private static final java.util.regex.Pattern LINK_TARGET = ~/#(.*) #(.*)/

    /** XML input factory created by the constructor; not used by the methods of this class. */
    private def factory;

    /**
     * Annotation types to write in each word, in declaration order. Filled from the
     * stand-off "linkGrp" elements and, in replace mode, extended with the types of
     * the "ana" elements already present in the source file.
     */
    private LinkedHashSet<String> links = new LinkedHashSet<String>();

    /** When true, the "ana" elements already present in a word are removed and re-written by this class. */
    boolean replace

    /** One stand-off reader per link group, positioned on its "linkGrp" start tag. key=type */
    private LinkedHashMap<String, XMLStreamReader> linkparsers = new LinkedHashMap<String, XMLStreamReader>();

    /** URL of the stand-off file. */
    private def anaurl;

    /** Last stand-off input stream opened for a link group reader. */
    private def anainputData;

    /** Factory of the stand-off readers. */
    private XMLInputFactory anafactory = XMLInputFactory.newInstance();

    /** Stand-off reader currently in use. */
    private XMLStreamReader anaparser;

    /** Stand-off reader used to copy header content. */
    private XMLStreamReader headerparser;

    /** First attribute value of the stand-off "respStmt" element; written as the "resp" of every "ana". */
    String respStmtID = "";

    /** First attribute value of every "taxonomy" element met in the source file. */
    ArrayList<String> presentTaxonomies = new ArrayList<String>();

    /** Value of the "id" attribute of the last "w" element opened. */
    String wordId;

    /** Replace mode only: values of the "ana" elements removed from the current word, by type. */
    HashMap<String, String> anaValues = new HashMap<String, String>();

    // NOTE(review): flagSourceDesc is never set to true inside this class, so the
    // "titleStmt" branch of processEndElement() only runs if something else sets it.
    boolean flagSourceDesc = false, flagW = false, flagAna = false;

    /** Replace mode only: attributes and text of the "ana" element being skipped. */
    String type = null, resp = null, anaValue= "";

    /** True once an "appInfo" end tag has been met in the source file. */
    boolean applicationWritten = false;

    /** True once a "classDecl" end tag has been met in the source file. */
    boolean taxonomiesWritten = false;

    /** True once declareNamespace() has written the namespace declarations. */
    boolean declarenamespace = false;

    /**
     * Instantiates a new annotation injection that keeps existing annotations untouched.
     *
     * @param url the xml-tei-txm file
     * @param anaurl the stand-off file
     */
    public AnnotationInjection(URL url, URL anaurl) {
        this(url, anaurl, false)
    }

    /**
     * Instantiates a new annotation injection.
     *
     * Parsing errors of the stand-off file are reported on standard output and
     * not rethrown; the link collections then stay empty or partially filled.
     *
     * @param url the xml-tei-txm file
     * @param anaurl the stand-off file
     * @param replace true to remove and re-write the "ana" elements already present in the words
     */
    public AnnotationInjection(URL url, URL anaurl, boolean replace) {
        super(url); // init reader and writer
        try {
            this.anaurl = anaurl;
            this.replace = replace
            factory = XMLInputFactory.newInstance();
            this.buildLinkParsers();// build a parser per linkgroup
        } catch (XMLStreamException ex) {
            System.out.println(ex);
        } catch (IOException ex) {
            // the original message stopped after "parsing " and dropped the cause
            System.out.println("IOException while parsing " + anaurl + ": " + ex);
        }
    }

    /**
     * Copies a part of the stand-off file to the output writer.
     *
     * The stand-off file is read again from its beginning. Elements, namespace
     * declarations, attributes, text and comments are copied once the element
     * named containertag has been opened, until that element is closed.
     *
     * @param containertag local name of the stand-off element to copy
     * @param captureTheTag true to copy the element itself, false to copy only its content
     */
    private void getHeaderInfos(String containertag, boolean captureTheTag)
    {
        def input = new BufferedInputStream(anaurl.openStream());
        try {
            XMLStreamReader reader = anafactory.createXMLStreamReader(input);
            headerparser = reader;
            try {
                boolean start = false;
                String name; // not called "localname": that name is used for the current source element
                for (int event = reader.next(); event != XMLStreamConstants.END_DOCUMENT; event = reader.next())
                {
                    if (event == XMLStreamConstants.START_ELEMENT) {
                        name = reader.getLocalName();
                        if (captureTheTag && name.equals(containertag)) // start copy with the tag
                            start = true;
                        if (start) { // copy header
                            String prefix = reader.getPrefix();
                            // getPrefix() may return null when the element has no prefix
                            if (prefix != null && prefix.length() > 0)
                                writer.writeStartElement(Nscontext.getNamespaceURI(prefix), name)
                            else
                                writer.writeStartElement(name);
                            for (int i = 0 ; i < reader.getNamespaceCount(); i++)
                                writer.writeNamespace(reader.getNamespacePrefix(i), reader.getNamespaceURI(i));
                            for (int i = 0 ; i < reader.getAttributeCount(); i++)
                                writer.writeAttribute(reader.getAttributeLocalName(i), reader.getAttributeValue(i));
                        }
                        if (!captureTheTag && name.equals(containertag)) // start copy after the tag
                            start = true;
                    } else if (event == XMLStreamConstants.END_ELEMENT) {
                        name = reader.getLocalName();
                        if (!captureTheTag && name.equals(containertag))
                            break;// stop before writing the end tag of the container

                        if (start)
                            writer.writeEndElement();

                        if (captureTheTag && name.equals(containertag))
                            break;// stop after writing the end tag of the container
                    } else if (event == XMLStreamConstants.CHARACTERS) {
                        if (start)
                            writer.writeCharacters(reader.getText());
                    } else if (event == XMLStreamConstants.COMMENT) {
                        if (start)
                            writer.writeComment(reader.getText());
                    }
                }
            } finally {
                reader.close();
            }
        } finally {
            // XMLStreamReader.close() does not close the underlying stream
            input.close();
        }
    }

    /**
     * Reads the whole stand-off file to find the link groups.
     *
     * Also stores the first attribute value of the "respStmt" element in respStmtID.
     *
     * @return the first attribute value of every "linkGrp" element, in document order
     */
    private LinkedHashSet<String> findGrpLink()
    {
        LinkedHashSet<String> found = new LinkedHashSet<String>();
        def input = new BufferedInputStream(anaurl.openStream());
        try {
            XMLStreamReader reader = anafactory.createXMLStreamReader(input);
            anaparser = reader;
            try {
                for (int event = reader.next(); event != XMLStreamConstants.END_DOCUMENT; event = reader.next()) {
                    if (event != XMLStreamConstants.START_ELEMENT)
                        continue;

                    String name = reader.getLocalName();
                    if (name.equals("linkGrp")) {
                        String targetsvalue = reader.getAttributeCount() > 0 ? reader.getAttributeValue(0) : null;
                        if (targetsvalue == null) {
                            System.err.println("Warning: linkGrp without attribute ignored")
                        } else if (found.contains(targetsvalue)) {
                            System.err.println("Warning: Multiple group declaration : "+targetsvalue+" has already been added, the first one will be used")
                        } else {
                            found.add(targetsvalue); // add the taxonomy type
                        }
                    } else if (name.equals("respStmt")) {
                        if (reader.getAttributeCount() > 0)
                            respStmtID = reader.getAttributeValue(0); // one attribute (id) only
                    }
                }
            } finally {
                reader.close();
            }
        } finally {
            // XMLStreamReader.close() does not close the underlying stream
            input.close();
        }
        return found;
    }

    /**
     * Builds the link parsers.
     *
     * One reader is opened on the stand-off file per link group found by
     * findGrpLink(), then moved forward to the "linkGrp" start tag of its group.
     * The readers stay open: getNextAnaValue() keeps reading from them.
     */
    private void buildLinkParsers()
    {
        // link group of the standoff file
        links = findGrpLink();
        linkparsers = new LinkedHashMap<String, XMLStreamReader>();

        // build one parser per link group
        for (String link : links) {
            anainputData = new BufferedInputStream(anaurl.openStream());
            linkparsers.put(link, anafactory.createXMLStreamReader(anainputData));
        }

        // position each parser on its group
        for (String link : links) {
            anaparser = linkparsers.get(link);
            for (int event = anaparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = anaparser.next()) {
                if (event == XMLStreamConstants.START_ELEMENT && anaparser.getLocalName().equals("linkGrp")) {
                    String targetsvalue = anaparser.getAttributeCount() > 0 ? anaparser.getAttributeValue(0) : null;
                    if (link.equals(targetsvalue)) // null-safe: link is never null here
                        break; // next element is a link start tag
                }
            }
        }
    }

    /**
     * Gets the value of the next "link" element read by the parser of a link group.
     *
     * The first attribute of the link must match "#... #..."; the text after the
     * second "#" is returned with "&lt;" replaced by "<". Links that do not match
     * are reported on standard error and skipped.
     *
     * @param link the link group type
     * @param wordId the id of the word being written; not used to select the link
     * @return the next annotation value, or "" when the group has no parser or no link is left
     */
    private String getNextAnaValue(String link, String wordId)
    {
        anaparser = linkparsers.get(link);
        if (anaparser == null)
            return "";

        // hasNext() guard: next() must not be called again once END_DOCUMENT was reached
        while (anaparser.hasNext()) {
            int event = anaparser.next();
            if (event != XMLStreamConstants.START_ELEMENT || !anaparser.getLocalName().equals("link"))
                continue;

            String targetsvalue = anaparser.getAttributeCount() > 0 ? anaparser.getAttributeValue(0) : null;
            def m = (targetsvalue == null) ? null : LINK_TARGET.matcher(targetsvalue);
            if (m != null && m.find()) { // balise externe
                String anavalue = m.group(2);
                return anavalue.replace("&lt;", "<");
            }
            System.err.println("Error: getNextAna(): link target is not well formed:  = "+targetsvalue);
        }
        return "";
    }

    /**
     * Writes one "ana" element per known annotation type for a word.
     *
     * The value comes from the stand-off file, except in replace mode for the
     * types that have no stand-off parser: those get the value saved from the
     * "ana" element removed from the word (empty when the word had none).
     *
     * NOTE(review): every element is written with the stand-off respStmtID as
     * "resp", including the preserved ones - confirm this is intended.
     *
     * @param wordId the word id
     */
    private void writeAnaTag(String wordId)
    {
        for (String link : links) {
            writer.writeStartElement(TXMNS, "ana");
            writer.writeAttribute("resp", "#"+respStmtID);
            writer.writeAttribute("type", "#"+link);

            String value;
            if (replace && !linkparsers.containsKey(link)) { // annotation kept from the source file
                value = anaValues.get(link);
            } else { // annotation read from the stand-off file, order is important
                value = getNextAnaValue(link, wordId);
            }
            writer.writeCharacters(value != null ? value : "");

            writer.writeEndElement(); // txm:ana
        }
    }

    /**
     * Removes the leading "#" of a reference.
     *
     * @param ref the reference, may be null
     * @return ref without its leading "#", or ref itself when it does not start with "#"
     */
    private static String stripHash(String ref)
    {
        if (ref != null && ref.startsWith("#"))
            return ref.substring(1);
        return ref;
    }

    /**
     * Handles a start tag of the source file.
     *
     * Records the taxonomies, remembers the id of the current word and, in replace
     * mode, skips the start tag of the "ana" elements found inside a word. Every
     * other start tag is passed to the parent class.
     */
    protected void processStartElement()
    {
        if (localname.equals("taxonomy")) {
            String taxo = parser.getAttributeValue(0) // taxonomy type
            presentTaxonomies.add(taxo);
        } else if (flagW && replace && localname.equals("ana")) {
            flagAna = true
            anaValue= ""
            type = null
            resp = null
            for (int i= 0 ; i < parser.getAttributeCount() ; i++ ) {
                String attribute = parser.getAttributeLocalName(i);
                if (attribute.equals("resp")) {
                    resp = parser.getAttributeValue(i);
                } else if (attribute.equals("type")) {
                    type = parser.getAttributeValue(i);
                }
            }
            // only drop the first character when it is the "#" of a reference
            type = stripHash(type);
            resp = stripHash(resp);
            return; // don't write the "ana" start element
        } else if (localname.equals("w")) {
            for (int i= 0 ; i < parser.getAttributeCount() ; i++ ) {
                if (parser.getAttributeLocalName(i).equals("id")) {
                    wordId = parser.getAttributeValue(i);
                    break
                }
            }
            flagW = true
            anaValues.clear()
        }

        super.processStartElement();
    }

    /**
     * Handles text of the source file: captured while inside a skipped "ana"
     * element, passed to the parent class otherwise.
     */
    protected void processCharacters() {
        // NOTE(review): the original author marked the pass-through branch with
        // "FORM CONTENT LOST !!!" - the concern is not explained in this file.
        if (flagAna) anaValue += parser.getText();
        else super.processCharacters();
    }

    /**
     * Handles an end tag of the source file.
     *
     * Writes the "ana" elements before a word is closed, stores the value of a
     * skipped "ana" element, and copies the stand-off header parts before their
     * container is closed. Except for skipped "ana" elements, the end tag is then
     * passed to the parent class.
     */
    protected void processEndElement() {
        switch (parser.getLocalName()) {
            case "w":
                writeAnaTag(wordId);
                flagW = false
                break;
            case "ana":
                if (flagAna) {
                    // the start tag was not written, so the end tag must never be written either
                    if (type != null && resp != null && anaValue != null) {
                        anaValues.put(type, anaValue)
                        links.add(type)
                    } else {
                        System.err.println("Warning: ana element without type or resp dropped in word "+wordId)
                    }
                    flagAna = false
                    return; // don't write the "ana" end element
                }
                break;

            case "appInfo":
                applicationWritten = true;
                getHeaderInfos("appInfo", false);
                break;

            case "classDecl":
                taxonomiesWritten = true;
                getHeaderInfos("classDecl", false);
                break;

            case "encodingDesc":
                if (!applicationWritten) {
                    writer.writeStartElement("appInfo");
                    getHeaderInfos("appInfo", false);
                    writer.writeEndElement(); // appInfo
                }
                if (!taxonomiesWritten) {
                    writer.writeStartElement("classDecl");
                    getHeaderInfos("classDecl", false);
                    writer.writeEndElement(); // classDecl
                }
                break;

            case "titleStmt":
                if (flagSourceDesc) {
                    getHeaderInfos("respStmt", true);
                    flagSourceDesc = false;
                }
                break;
        }
        super.processEndElement();
    }

    /**
     * Writes the TEI default namespace and the "txm" namespace declarations,
     * the first time it is called only.
     *
     * @return nothing meaningful
     */
    private declareNamespace() {
        if (!declarenamespace) {
            writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0");
            writer.writeNamespace("txm", TXMNS);
            declarenamespace = true;
        }
    }

    /**
     * Demo entry point: injects two stand-off files, one after the other, in "roland.xml".
     *
     * @param args optional: args[0] is the working directory, default is "xml/rgaqcj" in the user home
     */
    public static void main(String[] args) {

        // java.io.File does not expand "~", so the home directory is resolved explicitly
        File rootDir = (args != null && args.length > 0) ? new File(args[0]) : new File(System.getProperty("user.home"), "xml/rgaqcj");
        File injectionDir = new File(rootDir, "injection");
        injectionDir.mkdirs();

        File posDir = new File(rootDir, "pos");
        File srcfile = new File(new File(rootDir, "anainline"), "roland.xml");
        File pos1file = new File(posDir, "rolandTT1-w-ana.xml");

        File src2file = new File(injectionDir, "roland.xml");
        File pos2file = new File(posDir, "rolandTT2-w-ana.xml");

        println("process file : " + srcfile + " with : " + pos1file);
        def builder = new AnnotationInjection(srcfile.toURI().toURL(), pos1file.toURI().toURL());
        builder.transfomFile(src2file);

        println("process file : " + src2file + " with : " + pos2file);
        builder = new AnnotationInjection(src2file.toURI().toURL(), pos2file.toURI().toURL());
        builder.transfomFile(new File(injectionDir, "roland-FINAL.xml"));

        return;
    }
}