Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / teitxm / AnnotationExtraction.groovy @ 479

History | View | Annotate | Download (9.3 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25
// $LastChangedRevision: 3400 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.teitxm;
29

    
30
import java.util.ArrayList;
31
import java.util.HashMap;
32
import javax.xml.stream.*;
33
import java.net.URL;
34
import org.txm.importer.filters.*;
35
// TODO: Auto-generated Javadoc
36

    
37
/**
 * Extracts the {@code ana} annotations of an xml-tei-txm file and saves
 * them into a TEI stand-off file.
 *
 * For each type registered with {@link #setTypesToExtract(List)} the
 * stand-off file receives one {@code linkGrp} element holding one
 * {@code link} per matching {@code ana} element found inside a {@code w}
 * element. The input file is re-read once for the header and once per type.
 *
 * @author mdecorde
 */
public class AnnotationExtraction
{
    /** The input stream currently being parsed; null when no parse is running. */
    private def inputData;

    /** The StAX factory used to create every parser of this instance. */
    private def factory;

    /** The parser currently reading {@link #inputData}; null when no parse is running. */
    private XMLStreamReader parser;

    /** The dir. Never assigned nor read by this class. */
    private def dir;

    /** The writer of the stand-off file; only set while {@link #process()} runs. */
    private Writer writer;

    /** The URL of the xml-tei-txm input file. */
    private def url;

    /** The stand-off output file. */
    private def posfile;

    /** The xml-tei-txm input file. */
    private def anafile;

    /** The encoding given to the constructor. */
    private def encoding;

    /** The ana types to extract. */
    private List<String> extractTypes;

    /** The respStmt fragment copied from the input file into the output header. */
    private String respStmt;

    /** The local names of the elements written as empty (self-closed) tags. */
    ArrayList<String> solotags;

    /**
     * initialize.
     *
     * @param anafile the path to the xml-tei-txm file
     * @param posfile the output stand-off file
     * @param encoding the encoding of the xml-tei-txm file
     * @param milestones milestones tags (temporary); null is treated as an empty list
     */
    public AnnotationExtraction(File anafile, File posfile,String encoding,milestones) {
        this.anafile = anafile;
        this.url = anafile.toURI().toURL();
        this.posfile = posfile;
        this.encoding = encoding;
        this.solotags = (milestones != null) ? milestones : new ArrayList<String>();

        // Fail fast when the input cannot be opened, without keeping the
        // stream: every pass over the file opens (and closes) its own.
        url.openStream().close();

        factory = XMLInputFactory.newInstance();
        // Deliver each text node as a single CHARACTERS event so that trimming
        // a chunk never removes whitespace from the middle of a text node.
        factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
    }

    /**
     * Writes the whole stand-off file: header, one group per type, tail.
     *
     * The types must have been set with {@link #setTypesToExtract(List)}.
     * Nothing is written when the output file cannot be created. The writer
     * is always closed, even when a step throws.
     */
    public void process() {
        assert(extractTypes != null);
        try {
            if (this.createOutput(posfile, encoding)) {
                this.writeHead();
                this.writeBody(posfile, encoding);
                this.writeTail();
            }
        } finally {
            // writer is still null when createOutput failed
            if (writer != null) {
                writer.close();
                writer = null;
            }
        }
    }

    /**
     * Creates the output.
     *
     * The writer always encodes in UTF-8, which is the encoding declared by
     * the XML declaration written by {@link #writeHead()}.
     *
     * @param outfile the outfile
     * @param encoding the encoding (not used)
     * @return true, if successful
     */
    private boolean createOutput(File outfile, String encoding){
        try {
            writer = new OutputStreamWriter(new FileOutputStream(outfile) , "UTF-8");
            return true;
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Escapes the characters that are not allowed verbatim in XML text or in
     * a double-quoted attribute value.
     *
     * @param text the raw text; null is rendered as an empty string
     * @return the escaped text
     */
    private static String escapeXml(String text) {
        if (text == null) {
            return "";
        }
        // '&' must be replaced first so the other entities are not re-escaped
        return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace("\"", "&quot;");
    }

    /**
     * Returns the prefix of the element the reader is positioned on, followed
     * by ':' or an empty string when the element has no prefix. Must only be
     * called on a START_ELEMENT or END_ELEMENT event.
     *
     * @param reader the reader
     * @return the prefix ready to be prepended to the local name
     */
    private static String elementPrefix(XMLStreamReader reader) {
        String prefix = reader.getPrefix();
        return (prefix == null || prefix.isEmpty()) ? "" : prefix + ":";
    }

    /**
     * Closes the current parser and its input stream, if any.
     * Closing a StAX reader does not close the underlying stream, so both
     * must be released.
     */
    private void closeInput() {
        try {
            if (parser != null) {
                parser.close();
            }
        } finally {
            parser = null;
            if (inputData != null) {
                inputData.close();
                inputData = null;
            }
        }
    }

    /**
     * set the respStmt tag of the stand-off file by reading the respstmt of the xml-tei-txm file.
     *
     * Capture starts at the first {@code resp} element whose id is one of the
     * types to extract and stops at the next {@code respStmt} end tag.
     * The fragment stays empty when no such element is found or when the
     * capture is not terminated by a {@code respStmt} end tag.
     */
    private void setResp()
    {
        this.respStmt = "";
        StringBuilder captured = new StringBuilder();
        boolean capturing = false;
        String lastOpened = "";

        inputData = url.openStream();
        try {
            parser = factory.createXMLStreamReader(inputData);

            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
            {
                if (event == XMLStreamConstants.START_ELEMENT) {
                    String localname = parser.getLocalName();
                    if (localname.equals("resp")) {
                        // getAttributeValue expects a namespace URI, not the "xml" prefix
                        String id = parser.getAttributeValue(javax.xml.XMLConstants.XML_NS_URI, "id");
                        if (id == null)
                            id = parser.getAttributeValue(null, "id");
                        if (extractTypes.contains(id)) {
                            captured.setLength(0);
                            captured.append("<respStmt>\n");
                            capturing = true;
                        }
                    }
                    if (capturing) {
                        lastOpened = localname;
                        captured.append("\n<" + elementPrefix(parser) + localname);
                        for (int i = 0; i < parser.getAttributeCount(); i++) {
                            captured.append(" " + parser.getAttributeLocalName(i) + "=\"" + escapeXml(parser.getAttributeValue(i)) + "\"");
                        }
                        captured.append(solotags.contains(localname) ? "/>" : ">");
                    }
                }
                else if (event == XMLStreamConstants.END_ELEMENT && capturing) {
                    String localname = parser.getLocalName();
                    if (localname.equals("respStmt")) {
                        captured.append("\n</respStmt>");
                        this.respStmt = captured.toString();
                        System.out.println(this.respStmt);
                        return;
                    }
                    if (!solotags.contains(localname)) {
                        // keep "<a>text</a>" on one line, put other end tags on their own line
                        if (lastOpened.equals(localname))
                            captured.append("</" + elementPrefix(parser) + localname + ">");
                        else
                            captured.append("\n</" + elementPrefix(parser) + localname + ">");
                    }
                }
                else if (event == XMLStreamConstants.CHARACTERS && capturing) {
                    // parser text is unescaped: escape it again before writing it back
                    captured.append(escapeXml(parser.getText().trim()));
                }
            }
        } finally {
            closeInput();
        }
    }

    /**
     * write the header of the stand-off file.
     *
     * The header holds one title per type to extract followed by the
     * respStmt fragment computed by {@link #setResp()}, and opens the
     * text, body and div elements closed by {@link #writeTail()}.
     */
    private void writeHead() {
        this.setResp();

        writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        writer.write("<!DOCTYPE TEI SYSTEM \"tei_bfm_v1.dtd\">\n");
        writer.write("<TEI xmlns:txm=\"http://textometrie.ens-lyon.fr/1.0\">\n");
        writer.write("<teiHeader xml:lang=\"eng\">\n");
        writer.write("<fileDesc>\n");
        writer.write("<titleStmt>\n");
        for (String title : extractTypes) {
            writer.write("<title>" + escapeXml(title) + "</title>\n");
        }
        writer.write(this.respStmt+"\n");
        writer.write("</titleStmt>\n");
        writer.write("<publicationStmt>\n");
        writer.write("<distributor>BFM project - http://bfm.ens-lsh.fr</distributor>\n");
        writer.write("<availability>\n");
        writer.write("<p>(c) 2010 Projet BFM - CNRS/ENS-LSH.\n");
        writer.write("<hi>Conditions d'utilisation</hi> : \n");
        writer.write("Sous licence <ref target=\"http://creativecommons.org/licenses/by-sa/2.0/fr/\">Creative Commons</ref>.\n");
        writer.write("</p>\n");
        writer.write("</availability>\n");
        writer.write("</publicationStmt>\n");
        writer.write("<sourceDesc>\n");
        writer.write("<p>born digital : TXM project - http://textometrie.org</p>\n");
        writer.write("</sourceDesc>\n");
        writer.write("</fileDesc>\n");
        writer.write("</teiHeader>\n");
        writer.write("<text xml:lang=\"fr\" type=\"standoff\">\n");
        writer.write("<body>\n");
        writer.write("<div>\n");
    }

    /**
     * write the group of ana tags with the attribute type = type.
     *
     * Only {@code ana} elements located inside a {@code w} element are
     * considered; {@code ana} elements without a type attribute are ignored.
     * Each match produces a link from the id of the word to the trimmed text
     * of the ana element.
     *
     * @param type the type; it is matched against the type attribute as a
     *             regular expression (String.matches)
     */
    private void writeGroup(String type)
    {
        System.out.println("write group : "+type);

        boolean inWord = false;
        boolean inMatchingAna = false;
        String wordid = "";
        StringBuilder value = new StringBuilder();

        inputData = url.openStream();
        try {
            parser = factory.createXMLStreamReader(inputData);

            writer.write("<linkGrp type=\"" + escapeXml(type) + "\">\n");

            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
            {
                if (event == XMLStreamConstants.START_ELEMENT) {
                    String localname = parser.getLocalName();
                    if (localname.equals("w")) {
                        wordid = parser.getAttributeValue(null, "id");
                        inWord = true;
                    } else if (inWord && localname.equals("ana")) {
                        String currentType = parser.getAttributeValue(null, "type");
                        if (currentType != null && currentType.matches(type)) {
                            inMatchingAna = true;
                            value.setLength(0);
                        }
                    }
                }
                else if (event == XMLStreamConstants.END_ELEMENT) {
                    String localname = parser.getLocalName();
                    if (localname.equals("w")) {
                        inWord = false;
                    } else if (inMatchingAna && localname.equals("ana")) {
                        // only the ana elements of the requested type yield a link
                        writer.write("<link targets=\"#" + escapeXml(wordid) + " #" + escapeXml(value.toString()) + "\"/>\n");
                        inMatchingAna = false;
                    }
                }
                else if (event == XMLStreamConstants.CHARACTERS) {
                    if (inMatchingAna)
                        value.append(parser.getText().trim()); // catch interp text
                }
            }
            writer.write("</linkGrp>\n");
        } finally {
            closeInput();
        }
    }

    /**
     * write a group per ana type to extract from the anafile.
     *
     * @param TTrez not used
     * @param encoding not used
     * @return nothing meaningful
     */
    private def writeBody(File TTrez, String encoding)
    {
        for (String type : extractTypes) {
            writeGroup(type);
        }
    }

    /**
     * write the tail of the stand-off file = close div, body, text and TEI tags.
     */
    private void writeTail()
    {
        writer.write("</div>\n");
        writer.write("</body>\n");
        writer.write("</text>\n");
        writer.write("</TEI>\n");
    }

    /**
     * define the types to extract.
     *
     * @param types the new types to extract
     */
    public void setTypesToExtract(List<String> types)
    {
        extractTypes = types;
    }

    /**
     * The main method: runs the extraction on a sample file located under
     * the home directory of the user.
     *
     * @param args the arguments (not used)
     */
    public static void main(String[] args) {

        // "~" is a shell shortcut that the JVM does not expand
        String rootDir = System.getProperty("user.home") + "/xml/rgaqcj/"
        new File(rootDir+"/extract/").mkdir()

        File anafile = new File(rootDir+"/injection/","roland-ana.xml")
        File posfile =  new File(rootDir+"/extract/","roland-pos.xml")
        String encoding = "UTF-8"
        // extract the txm:ana elements whose reference is in this list
        List<String> types = ["#t1","#POS"]

        ArrayList<String> milestones = new ArrayList<String>()
        // the tags you want to keep as milestones
        milestones.add("tagUsage")
        milestones.add("pb")
        milestones.add("lb")
        milestones.add("catRef")

        AnnotationExtraction builder = new AnnotationExtraction(anafile,posfile,encoding,milestones)
        builder.setTypesToExtract(types)
        builder.process()
    }
}