Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / teitxm / AnnotationInjectionFromTTOutput.groovy @ 479

History | View | Annotate | Download (10.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2016-03-02 14:17:06 +0100 (mer., 02 mars 2016) $
25
// $LastChangedRevision: 3134 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.teitxm;
29
import javax.xml.stream.*
30

    
31
import org.txm.importer.StaxIdentityParser
32
import org.txm.importer.filters.*
33
import org.txm.importer.graal.PersonalNamespaceContext
34
// TODO: Auto-generated Javadoc
35
/**
 * The Class AnnotationInjectionFromTTOutput.
 *
 * Injects the annotations of a stand-off file into an xml-tei-txm file.
 *
 * @author mdecorde
 */
43

    
44
public class AnnotationInjectionFromTTOutput extends StaxIdentityParser {
45

    
46
        /** The namespace URI used for the txm:ana elements and bound to the "txm" prefix. */
        public static String TXMNS = "http://textometrie.org/1.0"

        /** The xml reader factory. */
        private def factory;

        /** The link groups: first attribute value of each linkGrp element of the stand-off file. */
        private List<String> links;

        /** The link parsers, one per link group. key=type */
        private HashMap<String, XMLStreamReader> linkparsers;

        /** The URL of the stand-off file. */
        private def anaurl;

        /** The input stream most recently opened on the stand-off file. */
        private def anainputData;

        /** The factory creating every parser of the stand-off file. */
        private XMLInputFactory anafactory = XMLInputFactory.newInstance();

        /** The stand-off parser currently in use. */
        private XMLStreamReader anaparser;

        /** The parser used to copy parts of the stand-off file into the output. */
        private XMLStreamReader headerparser;

        /** The resp stmt id: first attribute value of the respStmt element of the stand-off file. */
        String respStmtID = "";

        /** The present taxonomies: first attribute value of each taxonomy element met in the source file. */
        ArrayList<String> presentTaxonomies = new ArrayList<String>();
75

    
76
        /**
         * Instantiates a new annotation injection.
         *
         * The stand-off file is scanned immediately to build one parser per link group.
         * XMLStreamException and IOException raised by that scan are reported on the
         * standard output and are not rethrown.
         *
         * @param url the xml-tei-txm file
         * @param anaurl the stand-off file
         */
        public AnnotationInjectionFromTTOutput(URL url, URL anaurl) {
                super(url); // init reader and writer
                try {
                        this.anaurl = anaurl;
                        factory = XMLInputFactory.newInstance();
                        this.buildLinkParsers();// build a parser per linkgroup
                } catch (XMLStreamException ex) {
                        // name the file so that the report can be traced back to its source
                        System.out.println("XMLStreamException while parsing " + anaurl + ": " + ex);
                } catch (IOException ex) {
                        System.out.println("IOException while parsing " + anaurl + ": " + ex);
                }
        }
94

    
95
        /**
         * Copies a part of the stand-off file into the output writer.
         *
         * The stand-off file is read again from its beginning with a fresh parser.
         * Start elements (with their namespace declarations and attributes), end
         * elements, character data and comments are copied; other events are ignored.
         * Reading stops at the end tag of the container element, or at the end of the
         * document if the container is never met.
         *
         * @param containertag the local name of the element delimiting the part to copy
         * @param captureTheTag if true the container element itself is copied together
         *            with its content, if false only its content is copied
         */
        private void getHeaderInfos(String containertag, boolean captureTheTag)
        {
                InputStream headerInput = new BufferedInputStream(anaurl.openStream());
                anainputData = headerInput;
                try {
                        headerparser = anafactory.createXMLStreamReader(headerInput);
                        boolean start = false;
                        String localname;
                        for (int event = headerparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = headerparser.next())
                        {
                                if (event == XMLStreamConstants.START_ELEMENT) {
                                        localname = headerparser.getLocalName();
                                        if (captureTheTag && localname == containertag) // start copy with the tag
                                                start = true;
                                        if (start) { // copy header
                                                // getPrefix() may return null or "" when the element has no prefix
                                                String prefix = headerparser.getPrefix();
                                                if (prefix)
                                                        writer.writeStartElement(Nscontext.getNamespaceURI(prefix), localname)
                                                else
                                                        writer.writeStartElement(localname);
                                                for (int i = 0 ; i < headerparser.getNamespaceCount(); i++)
                                                        writer.writeNamespace(headerparser.getNamespacePrefix(i), headerparser.getNamespaceURI(i));
                                                // NOTE(review): attributes are written with their local name only, prefixes are dropped
                                                for (int i = 0 ; i < headerparser.getAttributeCount(); i++)
                                                        writer.writeAttribute(headerparser.getAttributeLocalName(i), headerparser.getAttributeValue(i));
                                        }
                                        if (!captureTheTag && localname == containertag) // start copy after the tag
                                                start = true;
                                } else if (event == XMLStreamConstants.END_ELEMENT) {
                                        localname = headerparser.getLocalName();
                                        if (!captureTheTag && localname == containertag)
                                                break;// stop looping before the end tag is written

                                        if (start)
                                                writer.writeEndElement();

                                        if (captureTheTag && localname == containertag)
                                                break;// stop looping once the end tag is written
                                } else if (event == XMLStreamConstants.CHARACTERS) {
                                        if (start)
                                                writer.writeCharacters(headerparser.getText());
                                } else if (event == XMLStreamConstants.COMMENT) {
                                        if (start)
                                                writer.writeComment(headerparser.getText());
                                }
                        }
                        headerparser.close();
                } finally {
                        // XMLStreamReader.close() does not close the underlying stream
                        headerInput.close();
                }
        }
141

    
142
        /**
         * Finds all the link groups of the stand-off file.
         *
         * The whole stand-off file is read once. The first attribute value of each
         * linkGrp element is collected, duplicates being reported on the error output
         * and skipped. As a side effect the first attribute value of each respStmt
         * element met is stored in respStmtID.
         *
         * @return the link group names, in document order
         */
        private List<String> findGrpLink()
        {
                List<String> groups = new ArrayList<String>();
                InputStream input = anaurl.openStream();
                anainputData = input;
                try {
                        anaparser = anafactory.createXMLStreamReader(input);

                        for (int event = anaparser.next(); event != XMLStreamConstants.END_DOCUMENT; event = anaparser.next()) {
                                if (event != XMLStreamConstants.START_ELEMENT)
                                        continue;

                                String localname = anaparser.getLocalName();
                                if (localname == "linkGrp") {
                                        String targetsvalue = anaparser.getAttributeValue(0)

                                        if (groups.contains(targetsvalue)) {
                                                System.err.println("Warning: Multiple group declaration : "+targetsvalue+" has already been added, the first one will be used")
                                        } else {
                                                groups.add(targetsvalue); // add the taxonomy type
                                        }
                                } else if (localname == "respStmt") {
                                        respStmtID = anaparser.getAttributeValue(0); // one attribute (id) only
                                }
                        }
                        anaparser.close();
                } finally {
                        // XMLStreamReader.close() does not close the underlying stream
                        input.close();
                }
                return groups;
        }
171

    
172
        /**
         * Creates the link parsers.
         * The groups declared in the stand-off file are listed first, then each group
         * gets its own parser, moved forward to the start tag of its linkGrp element.
         */
        private void buildLinkParsers()
        {
                links = findGrpLink(); // the link groups of the stand-off file
                linkparsers = new HashMap<String, XMLStreamReader>();

                // step 1: open a dedicated parser for every group
                for (String group : links) {
                        anainputData = new BufferedInputStream(anaurl.openStream());
                        linkparsers.put(group, anafactory.createXMLStreamReader(anainputData));
                }

                // step 2: move every parser to the linkGrp start tag of its own group
                for (String group : links) {
                        anaparser = linkparsers.get(group);
                        int event = anaparser.next();
                        while (event != XMLStreamConstants.END_DOCUMENT) {
                                boolean atGroupStart = event == XMLStreamConstants.START_ELEMENT &&
                                                anaparser.getLocalName().equals("linkGrp");
                                if (atGroupStart) {
                                        String targetsvalue = anaparser.getAttributeValue(0)
                                        if (targetsvalue.equals(group))
                                                break; // the next element is a link start tag
                                }
                                event = anaparser.next();
                        }
                }
        }
202

    
203
        /**
         * Gets the next tei:link value of a tei:linkGrp.
         *
         * The parser of the group is moved forward to the next link element. The first
         * attribute of that element is expected to look like "#xxx #value"; the part
         * following the second '#' is returned, with "&lt;" replaced by "<".
         * A link whose first attribute does not match is reported on the error output
         * and skipped.
         *
         * @param link the link group
         * @param wordId the word id; it does not select the link, links are consumed in
         *            document order
         * @return the next ana value, or "" when the group has no parser or no link left
         */
        private String getNextAnaValue(String link, String wordId)
        {
                anaparser = linkparsers.get(link);
                if (anaparser == null)
                        return "";

                def m;
                // hasNext() protects next(), which throws once the end of the document
                // has been reached (more words than links)
                while (anaparser.hasNext()) {
                        if (anaparser.next() != XMLStreamConstants.START_ELEMENT)
                                continue;
                        if (!anaparser.getLocalName().matches("link"))
                                continue;

                        String targetsvalue = anaparser.getAttributeValue(0)
                        if ((m = targetsvalue =~ /#(.*) #(.*)/)) { // external tag
                                String anavalue = m[0][2];
                                return anavalue.replace("&lt;", "<");
                        }
                        System.err.println("Error: getNextAna(): link target is not well formed:  = "+anaparser.getAttributeValue(0));
                }
                return "";
        }
232

    
233
        /**
         * Writes the ana tags of a word.
         *
         * One txm:ana element is written per link group, with a resp attribute
         * ("#" + respStmtID), a type attribute ("#" + group) and the next value of
         * the group as content.
         *
         * @param wordId the word id, forwarded to getNextAnaValue
         */
        private void writeAnaTag(String wordId)
        {
                for (String link : links) {
                        writer.writeStartElement(TXMNS, "ana");
                        writer.writeAttribute("resp", "#"+respStmtID);
                        writer.writeAttribute("type", "#"+link);
                        writer.writeCharacters(getNextAnaValue(link, wordId));
                        writer.writeEndElement(); // txm:ana
                }
        }
250

    
251
        String wordId;
252
        boolean flagSourceDesc = false;
253
        /**
         * Handles a start tag of the source file before delegating to the parent class.
         *
         * The first attribute value of every taxonomy element is added to
         * presentTaxonomies, and the value of the first attribute whose local name
         * is "id" (on any element) is kept in wordId.
         */
        protected void processStartElement()
        {
                boolean isTaxonomy = parser.getLocalName().matches("taxonomy");
                if (isTaxonomy) {
                        presentTaxonomies.add(parser.getAttributeValue(0)); // taxonomy type
                }

                int index = 0;
                while (index < parser.getAttributeCount()) {
                        if (parser.getAttributeLocalName(index) == "id") {
                                wordId = parser.getAttributeValue(index);
                                break;
                        }
                        index++;
                }

                super.processStartElement();
        }
268

    
269
        boolean applicationWritten = false;
270
        boolean taxonomiesWritten = false;
271
        /**
         * Handles an end tag of the source file before delegating to the parent class.
         *
         * - w: the ana tags of the word are written before the word is closed
         * - appInfo / classDecl: the content of the same element of the stand-off file
         *   is appended, and the element is flagged as written
         * - encodingDesc: appInfo and classDecl elements are created, and filled from
         *   the stand-off file, when they have not been met before
         * - titleStmt: the respStmt of the stand-off file is copied when flagSourceDesc is set
         *
         * NOTE(review): no visible code sets flagSourceDesc to true - confirm the titleStmt case is reachable.
         */
        protected void processEndElement() {
                String closing = parser.getLocalName();

                if (closing == "w") {
                        writeAnaTag(wordId);
                } else if (closing == "appInfo") {
                        applicationWritten = true;
                        getHeaderInfos("appInfo", false);
                } else if (closing == "classDecl") {
                        taxonomiesWritten = true;
                        getHeaderInfos("classDecl", false);
                } else if (closing == "encodingDesc") {
                        if (!applicationWritten) {
                                writer.writeStartElement("appInfo");
                                getHeaderInfos("appInfo", false);
                                writer.writeEndElement(); // appInfo
                        }
                        if (!taxonomiesWritten) {
                                writer.writeStartElement("classDecl");
                                getHeaderInfos("classDecl", false);
                                writer.writeEndElement(); // classDecl
                        }
                } else if (closing == "titleStmt" && flagSourceDesc) {
                        getHeaderInfos("respStmt", true);
                        flagSourceDesc = false;
                }

                super.processEndElement();
        }
311

    
312
        /** The declarenamespace. */
313
        boolean declarenamespace = false;
314

    
315
        /**
         * Declares the namespaces, once.
         * The TEI namespace is written as the default namespace and TXMNS is bound to
         * the "txm" prefix; later calls do nothing.
         *
         * @return true when the namespaces have just been declared, null otherwise
         */
        private declareNamespace() {
                if (declarenamespace) {
                        return null; // already declared
                }
                writer.writeDefaultNamespace("http://www.tei-c.org/ns/1.0");
                writer.writeNamespace("txm", TXMNS);
                declarenamespace = true;
        }
327

    
328
        /**
         * The main method.
         *
         * Runs two injections in a row on sample files stored under
         * "xml/rgaqcj" in the user home directory: the first result
         * (injection/roland.xml) is the source of the second one
         * (injection/roland-FINAL.xml).
         *
         * @param args the arguments, not used
         */
        public static void main(String[] args) {

                // "~" is not expanded by java.io.File: resolve the home directory explicitly
                File rootDir = new File(System.getProperty("user.home"), "xml/rgaqcj");
                File injectionDir = new File(rootDir, "injection");
                injectionDir.mkdirs();

                File srcfile = new File(new File(rootDir, "anainline"), "roland.xml");
                File pos1file = new File(new File(rootDir, "pos"), "rolandTT1-w-ana.xml");

                File src2file = new File(injectionDir, "roland.xml");
                File pos2file = new File(new File(rootDir, "pos"), "rolandTT2-w-ana.xml");
                File finalfile = new File(injectionDir, "roland-FINAL.xml");

                println("process file : " + srcfile + " with : " + pos1file);
                // the constructor takes the source and the stand-off URLs only
                def builder = new AnnotationInjectionFromTTOutput(srcfile.toURI().toURL(),
                                pos1file.toURI().toURL());
                builder.transfomFile(src2file);

                println("process file : " + src2file + " with : " + pos2file);
                builder = new AnnotationInjectionFromTTOutput(src2file.toURI().toURL(),
                                pos2file.toURI().toURL());
                builder.transfomFile(finalfile);
        }
358
}