Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / importer / scripts / xmltxm / AnnotationExtraction.groovy @ 1688

History | View | Annotate | Download (9.4 kB)

1 986 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 986 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 986 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 986 mdecorde
// Sophia Antipolis, University of Paris 3.
5 986 mdecorde
//
6 986 mdecorde
// The TXM platform is free software: you can redistribute it
7 986 mdecorde
// and/or modify it under the terms of the GNU General Public
8 986 mdecorde
// License as published by the Free Software Foundation,
9 986 mdecorde
// either version 2 of the License, or (at your option) any
10 986 mdecorde
// later version.
11 986 mdecorde
//
12 986 mdecorde
// The TXM platform is distributed in the hope that it will be
13 986 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 986 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 986 mdecorde
// PURPOSE. See the GNU General Public License for more
16 986 mdecorde
// details.
17 986 mdecorde
//
18 986 mdecorde
// You should have received a copy of the GNU General
19 986 mdecorde
// Public License along with the TXM platform. If not, see
20 986 mdecorde
// http://www.gnu.org/licenses.
21 986 mdecorde
//
22 986 mdecorde
//
23 986 mdecorde
//
24 986 mdecorde
// $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
25 986 mdecorde
// $LastChangedRevision: 3400 $
26 986 mdecorde
// $LastChangedBy: mdecorde $
27 986 mdecorde
//
28 986 mdecorde
package org.txm.importer.scripts.xmltxm
29 986 mdecorde
30 986 mdecorde
import java.util.ArrayList;
31 986 mdecorde
import java.util.HashMap;
32 986 mdecorde
import javax.xml.stream.*;
33 986 mdecorde
import java.net.URL;
34 986 mdecorde
import org.txm.importer.filters.*;
35 986 mdecorde
// TODO: Auto-generated Javadoc
36 986 mdecorde
37 986 mdecorde
/**
 * Extracts the {@code ana} annotations of an xml-tei-txm file and saves them
 * into a stand-off TEI file.
 *
 * For every type registered with {@link #setTypesToExtract(List)} the source
 * file is read once and a {@code <linkGrp type="...">} is written; it holds
 * one {@code <link targets="#wordId #value"/>} per matching {@code ana}
 * element found inside a {@code w} element.
 *
 * Typical use: construct, call {@link #setTypesToExtract(List)}, then
 * {@link #process()}.
 *
 * @author mdecorde
 */
public class AnnotationExtraction
{
    /** Namespace URI bound to the reserved "xml" prefix; used to look up xml:id. */
    private static final String XML_NAMESPACE_URI = "http://www.w3.org/XML/1998/namespace";

    /** The input stream currently opened on the source file (one per reading pass). */
    private def inputData;

    /** The StAX factory used to create the parsers. */
    private def factory;

    /** The parser of the current reading pass. */
    private XMLStreamReader parser;

    /** The dir (not used by this class). */
    private def dir;

    /** The writer of the stand-off file; null when no output is opened. */
    private Writer writer;

    /** The URL of the xml-tei-txm source file. */
    private def url;

    /** The stand-off output file. */
    private def posfile;

    /** The xml-tei-txm source file. */
    private def anafile;

    /** The encoding given at construction (the output itself is always written in UTF-8). */
    private def encoding;

    /** The ana types to extract. */
    private List<String> extractTypes;

    /** The respStmt fragment copied into the stand-off header; empty when none was found. */
    private String respStmt;

    /** Local names of the elements re-serialized as empty (milestone) tags. */
    ArrayList<String> solotags;

    /**
     * initialize.
     *
     * Fails immediately if the source file cannot be opened.
     *
     * @param anafile the path to the xml-tei-txm file
     * @param posfile the output stand-off file
     * @param encoding the encoding of the xml-tei-txm file
     * @param milestones milestones tags (temporary); null is treated as an empty list
     */
    public AnnotationExtraction(File anafile, File posfile,String encoding,milestones) {
        this.anafile = anafile;
        this.url = anafile.toURI().toURL();
        this.posfile = posfile;
        this.encoding = encoding;
        this.solotags = (milestones != null) ? milestones : new ArrayList<String>();

        // Keep the fail-fast behaviour (an unreadable source throws here) without
        // keeping a stream open: each reading pass opens and closes its own.
        url.openStream().close();

        factory = XMLInputFactory.newInstance();
        // Text values are trimmed as a whole; without coalescing a text node may be
        // delivered in several chunks and trimming each chunk would eat inner spaces.
        factory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
    }

    /**
     * Runs the extraction: creates the output file, then writes the header, one
     * link group per type to extract and the tail.
     *
     * Nothing is written when the output file cannot be created. The writer is
     * closed in every case, including when a writing step throws.
     * {@link #setTypesToExtract(List)} must have been called before.
     */
    public void process() {
        assert(extractTypes != null);
        try {
            if (this.createOutput(posfile,encoding)) {
                this.writeHead();
                this.writeBody( posfile, encoding);
                this.writeTail();
            }
        } finally {
            // writer is still null when createOutput failed
            if (writer != null) {
                writer.close();
                writer = null;
            }
        }
    }

    /**
     * Creates the output.
     *
     * The file is always written in UTF-8, which is the encoding declared by the
     * XML declaration emitted in {@link #writeHead()}.
     *
     * @param outfile the outfile
     * @param encoding the encoding (not used, see above)
     * @return true, if successful; false (after printing the error) otherwise
     */
    private boolean createOutput(File outfile, String encoding){
        try {
            writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile) , "UTF-8"));
            return true;
        } catch (Exception e) {
            System.out.println(e.getLocalizedMessage());
            return false;
        }
    }

    /**
     * Escapes the XML markup characters of a value before it is written.
     *
     * @param raw the value to escape; null is rendered as "null", like string concatenation does
     * @param inAttribute true to also escape the double quote
     * @return the escaped value
     */
    private static String escapeXml(String raw, boolean inAttribute) {
        String escaped = String.valueOf(raw).replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
        return inAttribute ? escaped.replace("\"", "&quot;") : escaped;
    }

    /**
     * set the respStmt tag of the stand-off file by reading the respstmt of the xml-tei-txm file.
     *
     * Capture starts at the first {@code resp} element whose id (xml:id, or any
     * attribute with local name "id") is one of the types to extract, and stops at
     * the next closing {@code respStmt}. Markup and text met outside of a capture
     * are ignored. When nothing complete is captured, respStmt is left empty.
     */
    private void setResp()
    {
        this.respStmt = "";
        StringBuilder captured = new StringBuilder();
        boolean capturing = false;
        boolean complete = false;
        String lastOpenLocalName = "";

        inputData = url.openStream();
        try {
            parser = factory.createXMLStreamReader(inputData);
            try {
                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                {
                    if (event == XMLStreamConstants.START_ELEMENT) {
                        String localName = parser.getLocalName();
                        if (!capturing && "resp".equals(localName)) {
                            String id = parser.getAttributeValue(XML_NAMESPACE_URI, "id");
                            if (id == null)
                                id = parser.getAttributeValue(null, "id");
                            if (extractTypes.contains(id)) {
                                captured.append("<respStmt>\n");
                                capturing = true;
                            }
                        }
                        if (capturing) {
                            lastOpenLocalName = localName;
                            captured.append("\n<").append(qualifier()).append(localName);
                            for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                captured.append(" ").append(parser.getAttributeLocalName(i));
                                captured.append("=\"").append(escapeXml(parser.getAttributeValue(i), true)).append("\"");
                            }
                            captured.append(solotags.contains(localName) ? "/>" : ">");
                        }
                    } else if (event == XMLStreamConstants.END_ELEMENT) {
                        if (!capturing)
                            continue;
                        String localName = parser.getLocalName();
                        if ("respStmt".equals(localName)) {
                            captured.append("\n</respStmt>");
                            complete = true;
                            break;
                        }
                        // milestone tags were already written as empty elements
                        if (!solotags.contains(localName)) {
                            // keep the closing tag on the line of its opening tag when nothing was nested
                            captured.append(lastOpenLocalName.equals(localName) ? "</" : "\n</");
                            captured.append(qualifier()).append(localName).append(">");
                        }
                    } else if (event == XMLStreamConstants.CHARACTERS) {
                        if (capturing)
                            captured.append(escapeXml(parser.getText().trim(), false));
                    }
                }
            } finally {
                parser.close();
            }
        } finally {
            inputData.close();
        }

        if (complete) {
            this.respStmt = captured.toString();
            System.out.println(this.respStmt);
        }
    }

    /**
     * Gives the prefix of the current element followed by ":".
     *
     * Must only be called while the parser is on a start or end element.
     *
     * @return the qualifier to put before the local name, empty when the element has no prefix
     */
    private String qualifier() {
        String prefix = parser.getPrefix();
        return (prefix == null || prefix.isEmpty()) ? "" : prefix + ":";
    }

    /**
     * write the header of the stand-off file.
     *
     * The titleStmt holds one title per type to extract, followed by the
     * respStmt fragment read from the source file.
     */
    private void writeHead() {
        this.setResp();

        writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
        writer.write("<!DOCTYPE TEI SYSTEM \"tei_bfm_v1.dtd\">\n");
        writer.write("<TEI xmlns:txm=\"http://textometrie.ens-lyon.fr/1.0\">\n");
        writer.write("<teiHeader xml:lang=\"eng\">\n");
        writer.write("<fileDesc>\n");
        writer.write("<titleStmt>\n");
        for (String type : extractTypes)
            writer.write("<title>"+escapeXml(type, false)+"</title>\n");
        writer.write(this.respStmt+"\n");
        writer.write("</titleStmt>\n");
        writer.write("<publicationStmt>\n");
        writer.write("<distributor>BFM project - http://bfm.ens-lsh.fr</distributor>\n");
        writer.write("<availability>\n");
        writer.write("<p>(c) 2010 Projet BFM - CNRS/ENS-LSH.\n");
        writer.write("<hi>Conditions d'utilisation</hi> : \n");
        writer.write("Sous licence <ref target=\"http://creativecommons.org/licenses/by-sa/2.0/fr/\">Creative Commons</ref>.\n");
        writer.write("</p>\n");
        writer.write("</availability>\n");
        writer.write("</publicationStmt>\n");
        writer.write("<sourceDesc>\n");
        writer.write("<p>born digital : TXM project - http://textometrie.org</p>\n");
        writer.write("</sourceDesc>\n");
        writer.write("</fileDesc>\n");
        writer.write("</teiHeader>\n");
        writer.write("<text xml:lang=\"fr\" type=\"standoff\">\n");
        writer.write("<body>\n");
        writer.write("<div>\n");
    }

    /**
     * write the group of ana tags with the attribute type = type.
     *
     * One link is written per {@code ana} element that is inside a {@code w}
     * element and whose type attribute is exactly {@code type}; ana elements of
     * another type, or without type, are skipped. The link targets are the id of
     * the word and the trimmed text of the ana element. A word without id
     * attribute gives the target "#null".
     *
     * @param type the type
     */
    private void writeGroup(String type)
    {
        System.out.println("write group : "+type);

        boolean insideWord = false;
        boolean capturing = false;
        String wordId = "";
        StringBuilder value = new StringBuilder();

        inputData = url.openStream();
        try {
            parser = factory.createXMLStreamReader(inputData);
            try {
                writer.write("<linkGrp type=\""+escapeXml(type, true)+"\">\n");

                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
                {
                    if (event == XMLStreamConstants.START_ELEMENT) {
                        String localName = parser.getLocalName();
                        if ("w".equals(localName)) {
                            wordId = parser.getAttributeValue(null, "id");
                            insideWord = true;
                        } else if ("ana".equals(localName) && insideWord) {
                            // exact comparison: the type is an identifier, not a regular expression
                            if (type.equals(parser.getAttributeValue(null, "type"))) {
                                capturing = true;
                                value.setLength(0);
                            }
                        }
                    } else if (event == XMLStreamConstants.END_ELEMENT) {
                        String localName = parser.getLocalName();
                        if ("w".equals(localName)) {
                            insideWord = false;
                        } else if ("ana".equals(localName) && capturing) {
                            // only the ana of the requested type gives a link
                            writer.write("<link targets=\"#"+escapeXml(wordId, true)+" #"+escapeXml(value.toString(), true)+"\"/>\n");
                            capturing = false;
                        }
                    } else if (event == XMLStreamConstants.CHARACTERS) {
                        if (capturing)
                            value.append(parser.getText().trim());
                    }
                }
                writer.write("</linkGrp>\n");
            } finally {
                parser.close();
            }
        } finally {
            inputData.close();
        }
    }

    /**
     * write a group per ana type to extract from the anafile.
     *
     * @param TTrez not used
     * @param encoding not used
     */
    private void writeBody(File TTrez, String encoding)
    {
        for (String type : extractTypes) {
            writeGroup(type);
        }
    }

    /**
     * write the tail of the stand-off file = close div, body, text and TEI tags.
     */
    private void writeTail()
    {
        writer.write("</div>\n");
        writer.write("</body>\n");
        writer.write("</text>\n");
        writer.write("</TEI>\n");
    }

    /**
     * define the types to extract.
     *
     * @param types the new types to extract
     */
    public void setTypesToExtract(List<String> types)
    {
        extractTypes = types;
    }

    /**
     * The main method: sample run on files located under the "xml/rgaqcj"
     * directory of the user home.
     *
     * @param args the arguments (not used)
     */
    public static void main(String[] args) {

        // java.io.File does not expand "~": resolve the home directory explicitly
        File rootDir = new File(System.getProperty("user.home"), "xml/rgaqcj")
        File extractDir = new File(rootDir, "extract")
        extractDir.mkdirs()

        File anafile = new File(new File(rootDir, "injection"), "roland-ana.xml")
        File posfile = new File(extractDir, "roland-pos.xml")
        String encoding = "UTF-8"
        // extracts the txm:ana elements whose reference is in this list
        List<String> types = ["#t1","#POS"]

        ArrayList<String> milestones = new ArrayList<String>()
        //the tags you want to keep as milestones
        milestones.add("tagUsage")
        milestones.add("pb")
        milestones.add("lb")
        milestones.add("catRef")

        AnnotationExtraction builder = new AnnotationExtraction(anafile,posfile,encoding,milestones)
        builder.setTypesToExtract(types)
        builder.process()
    }
}