Statistics
| Revision:

root / tmp / org.txm.treetagger.core / src / org / txm / importer / xmltxm / AnnotationInjectionFromTTOutput.groovy @ 1688

History | View | Annotate | Download (10.5 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
//
//
//
// $LastChangedDate: 2016-03-02 14:17:06 +0100 (mer., 02 mars 2016) $
// $LastChangedRevision: 3134 $
// $LastChangedBy: mdecorde $
//
28
package org.txm.importer.xmltxm
29
import javax.xml.stream.*
30

    
31
import org.txm.importer.StaxIdentityParser
32
import org.txm.importer.filters.*
33
import org.txm.importer.PersonalNamespaceContext
34

    
35
/**
 * The Class AnnotationInjectionFromTTOutput.
 *
 * Injects annotations from a stand-off file into an XML-TEI-TXM file.
 *
 * @author mdecorde
 */
42

    
43
public class AnnotationInjectionFromTTOutput extends StaxIdentityParser {
44

    
45
        /** Namespace URI used when writing the txm:ana elements. */
        public static String TXMNS = "http://textometrie.org/1.0"

        /** StAX input factory created by the constructor. */
        private def factory;

        /** First-attribute values of the linkGrp elements of the stand-off file, as returned by findGrpLink(). */
        private List<String> links;

        /** One stream reader per link group; the key is the link group value stored in links. */
        private HashMap<String, XMLStreamReader> linkparsers;

        /** Location of the stand-off file. */
        private def anaurl;

        /** Most recently opened input stream on the stand-off file. */
        private def anainputData;

        /** Factory used to create every reader opened on the stand-off file. */
        private XMLInputFactory anafactory = XMLInputFactory.newInstance();

        /** Reader currently used to scan the link groups and links of the stand-off file. */
        private XMLStreamReader anaparser;

        /** Reader used by getHeaderInfos() to copy fragments of the stand-off file. */
        private XMLStreamReader headerparser;

        /** First-attribute value of the last respStmt element met in the stand-off file. */
        String respStmtID = "";

        /** First-attribute values of the taxonomy elements met in the source file. */
        ArrayList<String> presentTaxonomies = new ArrayList<String>();
74

    
75
        /**
         * Instantiates a new annotation injection.
         *
         * Stores the stand-off file location and prepares one reader per link
         * group of that file (see buildLinkParsers()). XML and I/O errors raised
         * while doing so are reported on standard output and are not rethrown.
         *
         * @param url the xml-tei-txm file
         * @param anaurl the stand-off file
         */
        public AnnotationInjectionFromTTOutput(URL url, URL anaurl) {
                super(url); // init reader and writer
                try {
                        this.anaurl = anaurl;
                        factory = XMLInputFactory.newInstance();
                        this.buildLinkParsers(); // build a parser per linkgroup
                } catch (XMLStreamException ex) {
                        System.out.println("XMLStreamException while parsing " + anaurl + ": " + ex);
                } catch (IOException ex) {
                        // report which file failed and why, instead of a message ending at "parsing "
                        System.out.println("IOException while parsing " + anaurl + ": " + ex);
                }
        }
93

    
94
        /**
         * Copies a fragment of the stand-off file to the output writer.
         *
         * The stand-off file is re-read from its beginning. Copying covers start
         * tags (with their namespace declarations and attributes), end tags,
         * character data and comments, and stops at the first end tag whose local
         * name is containertag. Nothing is written if that element is never met.
         *
         * @param containertag local name of the element delimiting the fragment
         * @param captureTheTag true to copy the container element itself along with
         *        its content, false to copy only what lies between its start and end tags
         */
        private void getHeaderInfos(String containertag, boolean captureTheTag)
        {
                def input = new BufferedInputStream(anaurl.openStream());
                anainputData = input;
                try {
                        headerparser = anafactory.createXMLStreamReader(input);
                        boolean start = false;
                        String localname;
                        for (int event = headerparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = headerparser.next())
                        {
                                if (event == XMLStreamConstants.START_ELEMENT) {
                                        localname = headerparser.getLocalName();
                                        boolean isContainer = (localname == containertag);
                                        if (captureTheTag && isContainer) { // the container itself is copied
                                                start = true;
                                        }
                                        if (start) { // copy the start tag
                                                // getPrefix() may return null when the element has no prefix
                                                String prefix = headerparser.getPrefix();
                                                if (prefix) {
                                                        // NOTE(review): the URI is looked up in Nscontext, not in the stand-off reader - confirm this is intended
                                                        writer.writeStartElement(Nscontext.getNamespaceURI(prefix), localname)
                                                } else {
                                                        writer.writeStartElement(localname);
                                                }
                                                for (int i = 0 ; i < headerparser.getNamespaceCount(); i++) {
                                                        writer.writeNamespace(headerparser.getNamespacePrefix(i), headerparser.getNamespaceURI(i));
                                                }
                                                // attributes are written with their local name only
                                                for (int i = 0 ; i < headerparser.getAttributeCount(); i++) {
                                                        writer.writeAttribute(headerparser.getAttributeLocalName(i), headerparser.getAttributeValue(i));
                                                }
                                        }
                                        if (!captureTheTag && isContainer) { // start copying after the container start tag
                                                start = true;
                                        }
                                } else if (event == XMLStreamConstants.END_ELEMENT) {
                                        localname = headerparser.getLocalName();
                                        boolean isContainer = (localname == containertag);
                                        if (!captureTheTag && isContainer) {
                                                break; // container end tag is not copied
                                        }
                                        if (start) {
                                                writer.writeEndElement();
                                        }
                                        if (captureTheTag && isContainer) {
                                                break; // container end tag has just been copied
                                        }
                                } else if (event == XMLStreamConstants.CHARACTERS) {
                                        if (start) {
                                                writer.writeCharacters(headerparser.getText());
                                        }
                                } else if (event == XMLStreamConstants.COMMENT) {
                                        if (start) {
                                                writer.writeComment(headerparser.getText());
                                        }
                                }
                        }
                        headerparser.close();
                } finally {
                        // always release the stream, even when parsing or writing fails
                        input.close();
                }
        }
141

    
142
        /**
         * Scans the whole stand-off file for link groups.
         *
         * For every linkGrp element the value of its first attribute is collected;
         * a value met again is reported on standard error and ignored. As a side
         * effect, respStmtID receives the first attribute value of each respStmt
         * element met (the last one wins).
         *
         * @return the distinct first-attribute values of the linkGrp elements, in document order
         */
        private List<String> findGrpLink()
        {
                ArrayList<String> found = new ArrayList<String>();
                def input = new BufferedInputStream(anaurl.openStream());
                anainputData = input;
                try {
                        anaparser = anafactory.createXMLStreamReader(input);

                        for (int event = anaparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = anaparser.next()) {
                                if (event != XMLStreamConstants.START_ELEMENT) {
                                        continue;
                                }
                                String localname = anaparser.getLocalName();
                                if (localname.equals("linkGrp")) {
                                        // NOTE(review): relies on the group name being the first attribute - confirm
                                        String targetsvalue = anaparser.getAttributeValue(0)

                                        if (found.contains(targetsvalue)) {
                                                System.err.println("Warning: Multiple group declaration : "+targetsvalue+" has already been added, the first one will be used")
                                        } else {
                                                found.add(targetsvalue); // add the taxonomy type
                                        }
                                } else if (localname.equals("respStmt")) {
                                        respStmtID = anaparser.getAttributeValue(0); // one attribute (id) only
                                }
                        }
                        anaparser.close();
                } finally {
                        // always release the stream, even when parsing fails
                        input.close();
                }
                return found;
        }
172

    
173
        /**
         * Prepares one reader per link group of the stand-off file.
         *
         * The group names come from findGrpLink(). Each reader is opened on its own
         * stream and advanced to the start tag of its linkGrp element, so that the
         * next elements it delivers are the links of that group.
         */
        private void buildLinkParsers() {
                links = findGrpLink();
                linkparsers = new HashMap<String, XMLStreamReader>();

                // first pass: one independent reader per group
                // NOTE(review): the streams opened here are only kept in anainputData, which is
                // overwritten at each iteration; nothing in this class closes them.
                for (String groupName : links) {
                        anainputData = new BufferedInputStream(anaurl.openStream());
                        XMLStreamReader groupReader = anafactory.createXMLStreamReader(anainputData);
                        linkparsers.put(groupName, groupReader);
                }

                // second pass: move every reader onto the start tag of its own group
                for (String groupName : links) {
                        anaparser = linkparsers.get(groupName);
                        boolean positioned = false;
                        while (!positioned) {
                                int event = anaparser.next();
                                if (event == XMLStreamConstants.END_DOCUMENT) {
                                        break;
                                }
                                if (event == XMLStreamConstants.START_ELEMENT && anaparser.getLocalName().equals("linkGrp")) {
                                        String targetsvalue = anaparser.getAttributeValue(0)
                                        // once positioned, the next element is a link start tag
                                        positioned = targetsvalue.equals(groupName);
                                }
                        }
                }
        }
203

    
204
        /**
         * Gets the value carried by the next link element delivered by the reader
         * of a link group.
         *
         * The first attribute of the link is expected to look like "#x #y"; the
         * "y" part is returned with "&lt;" replaced by "<". A link that does not
         * match is reported on standard error and skipped. When the reader has no
         * more events it is closed and the empty string is returned, on this call
         * and on every later one.
         *
         * @param link the link group value, used as key of linkparsers
         * @param wordId the id of the word being annotated; not used for the lookup,
         *        links are consumed in document order
         * @return the annotation value, or "" when no link is left
         */
        private String getNextAnaValue(String link, String wordId) {
                anaparser = linkparsers.get(link);
                // hasNext() guard: calling next() on an exhausted reader throws, which used
                // to happen on the second call after the end of the stand-off file was reached
                while (anaparser.hasNext()) {
                        int event = anaparser.next();
                        if (event == XMLStreamConstants.END_DOCUMENT) {
                                break;
                        }
                        if (event != XMLStreamConstants.START_ELEMENT || !anaparser.getLocalName().equals("link")) {
                                continue;
                        }
                        String targetsvalue = anaparser.getAttributeValue(0)
                        def m = (targetsvalue =~ /#(.*) #(.*)/)
                        if (m) { // balise externe
                                String anavalue = m[0][2];
                                return anavalue.replace("&lt;", "<");
                        }
                        System.err.println("Error: getNextAna(): link target is not well formed:  = "+targetsvalue);
                }
                anaparser.close();
                return "";
        }
233

    
234
        /**
         * Writes the ana elements of a word, one per link group.
         *
         * Each element is written in the TXMNS namespace with a "resp" attribute
         * ("#" + respStmtID), a "type" attribute ("#" + link group) and, as text,
         * the value returned by getNextAnaValue() for that group.
         *
         * @param wordId the word id, forwarded to getNextAnaValue()
         */
        private void writeAnaTag(String wordId)
        {
                for (int i = 0; i < links.size(); i++) {
                        String type = links.get(i);
                        writer.writeStartElement(TXMNS, "ana");
                        writer.writeAttribute("resp", "#" + respStmtID);
                        writer.writeAttribute("type", "#" + type);
                        String value = getNextAnaValue(type, wordId);
                        writer.writeCharacters(value);
                        writer.writeEndElement(); // txm:ana
                }
        }
251

    
252
        /** Value of the most recent "id" attribute met on a start element of the source file. */
        String wordId;

        /**
         * Read by processEndElement() when a titleStmt element closes.
         * NOTE(review): nothing in this class sets it to true - confirm where it is meant to be raised.
         */
        boolean flagSourceDesc = false;

        /**
         * Handles a start element of the source file before delegating to the parent class.
         *
         * Records the first attribute of every taxonomy element in presentTaxonomies
         * and remembers in wordId the value of the first attribute whose local name is "id".
         */
        protected void processStartElement()
        {
                String elementName = parser.getLocalName();
                if (elementName.equals("taxonomy")) {
                        presentTaxonomies.add(parser.getAttributeValue(0)); // taxonomy type
                }

                int idx = 0;
                boolean idFound = false;
                while (!idFound && idx < parser.getAttributeCount()) {
                        if (parser.getAttributeLocalName(idx) == "id") {
                                wordId = parser.getAttributeValue(idx);
                                idFound = true;
                        }
                        idx++;
                }
                super.processStartElement();
        }
269

    
270
        /** Set once the end tag of an appInfo element of the source file has been handled. */
        boolean applicationWritten = false;

        /** Set once the end tag of a classDecl element of the source file has been handled. */
        boolean taxonomiesWritten = false;

        /**
         * Handles an end element of the source file before delegating to the parent class.
         *
         * Depending on the local name of the closing element:
         * "w" gets its ana elements; "appInfo" and "classDecl" get the content of
         * the same-named element of the stand-off file; "encodingDesc" gets a new
         * appInfo and/or classDecl element when the source file had none;
         * "titleStmt" gets the respStmt element of the stand-off file when
         * flagSourceDesc is set.
         */
        protected void processEndElement() {
                String closing = parser.getLocalName();
                if (closing == "w") {
                        writeAnaTag(wordId);
                } else if (closing == "appInfo") {
                        applicationWritten = true;
                        getHeaderInfos("appInfo", false);
                } else if (closing == "classDecl") {
                        taxonomiesWritten = true;
                        getHeaderInfos("classDecl", false);
                } else if (closing == "encodingDesc") {
                        if (!applicationWritten) {
                                writer.writeStartElement("appInfo");
                                getHeaderInfos("appInfo", false);
                                writer.writeEndElement(); // appInfo
                        }
                        if (!taxonomiesWritten) {
                                writer.writeStartElement("classDecl");
                                getHeaderInfos("classDecl", false);
                                writer.writeEndElement(); // classDecl
                        }
                } else if (closing == "titleStmt" && flagSourceDesc) {
                        getHeaderInfos("respStmt", true);
                        flagSourceDesc = false;
                }
                super.processEndElement();
        }
312

    
313
        /** True once declareNamespace() has written the namespace declarations. */
        boolean declarenamespace = false;

        /**
         * Writes the TEI default namespace and the "txm" namespace declarations,
         * the first time it is called only. Not called from within this class.
         *
         * @return true when the declarations have just been written, null when
         *         they had already been written
         */
        private declareNamespace() {
                if (declarenamespace) {
                        return null;
                }
                writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0");
                writer.writeNamespace("txm", TXMNS);
                declarenamespace = true;
                return true;
        }
328

    
329
        /**
         * Manual test entry point: injects two stand-off files in a row into a
         * sample document found under the "xml/rgaqcj" directory of the user home.
         *
         * @param args the arguments (unused)
         */
        public static void main(String[] args) {

                // "~" is not expanded by the JVM: resolve the home directory explicitly
                File rootDir = new File(System.getProperty("user.home"), "xml/rgaqcj");
                File injectionDir = new File(rootDir, "injection");
                injectionDir.mkdir();

                // java.io.File has no three-String constructor: nest the parent directories
                File posDir = new File(rootDir, "pos");
                File srcfile = new File(new File(rootDir, "anainline"), "roland.xml");
                File pos1file = new File(posDir, "rolandTT1-w-ana.xml");

                File src2file = new File(injectionDir, "roland.xml");
                File pos2file = new File(posDir, "rolandTT2-w-ana.xml");

                // this class only declares the (URL, URL) constructor
                println("process file : " + srcfile + " with : " + pos1file);
                def builder = new AnnotationInjectionFromTTOutput(srcfile.toURI().toURL(),
                                pos1file.toURI().toURL());
                // NOTE(review): transfomFile is inherited and not visible here - confirm both overloads exist
                builder.transfomFile(src2file);

                println("process file : " + src2file + " with : " + pos2file);
                builder = new AnnotationInjectionFromTTOutput(src2file.toURI().toURL(), pos2file.toURI().toURL());
                builder.transfomFile(injectionDir.getPath() + File.separator, "roland-FINAL.xml");
        }
362
}