root / tmp / org.txm.core / src / java / org / txm / importer / scripts / xmltxm / AnnotationExtraction.groovy @ 1688
History | View | Annotate | Download (9.4 kB)
1 | 986 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 986 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 986 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 986 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 986 | mdecorde | //
|
6 | 986 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 986 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 986 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 986 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 986 | mdecorde | // later version.
|
11 | 986 | mdecorde | //
|
12 | 986 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 986 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 986 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 986 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 986 | mdecorde | // details.
|
17 | 986 | mdecorde | //
|
18 | 986 | mdecorde | // You should have received a copy of the GNU General
|
19 | 986 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 986 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 986 | mdecorde | //
|
22 | 986 | mdecorde | //
|
23 | 986 | mdecorde | //
|
24 | 986 | mdecorde | // $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
|
25 | 986 | mdecorde | // $LastChangedRevision: 3400 $
|
26 | 986 | mdecorde | // $LastChangedBy: mdecorde $
|
27 | 986 | mdecorde | //
|
28 | 986 | mdecorde | package org.txm.importer.scripts.xmltxm
|
29 | 986 | mdecorde | |
30 | 986 | mdecorde | import java.util.ArrayList; |
31 | 986 | mdecorde | import java.util.HashMap; |
32 | 986 | mdecorde | import javax.xml.stream.*; |
33 | 986 | mdecorde | import java.net.URL; |
34 | 986 | mdecorde | import org.txm.importer.filters.*; |
35 | 986 | mdecorde | // TODO: Auto-generated Javadoc
|
36 | 986 | mdecorde | |
/**
 * Extracts the {@code ana} annotations of an XML-TEI-TXM file and saves them
 * into a stand-off TEI file.
 *
 * Typical use: build the extractor, declare the annotation types with
 * {@link #setTypesToExtract(List)}, then call {@link #process()}.
 * The source file is parsed once for the header and once per requested type.
 *
 * @author mdecorde
 */
public class AnnotationExtraction
{
	/** The stream opened on the source file by the current parsing pass. */
	private def inputData;

	/** The StAX factory shared by every parsing pass. */
	private def factory;

	/** The StAX reader of the current parsing pass. */
	private XMLStreamReader parser;

	/** The dir. */
	private def dir;

	/** The writer of the stand-off file; null while no output is open. */
	private Writer writer;

	/** The URL of the source xml-tei-txm file. */
	private def url;

	/** The output stand-off file. */
	private def posfile;

	/** The source xml-tei-txm file. */
	private def anafile;

	/** The encoding given for the source file. */
	private def encoding;

	/** The annotation types to extract, one linkGrp is written per type. */
	private List<String> extractTypes;

	/** The respStmt element copied into the stand-off header ("" when none was found). */
	private String respStmt;

	/** The local names of the elements written as empty (milestone) tags. */
	ArrayList<String> solotags;

	/**
	 * initialize.
	 *
	 * @param anafile the path to the xml-tei-txm file
	 * @param posfile the output stand-off file
	 * @param encoding the encoding of the xml-tei-txm file
	 * @param milestones milestones tags (temporary)
	 */
	public AnnotationExtraction(File anafile, File posfile,String encoding,milestones) {
		this.anafile = anafile;
		this.url = anafile.toURI().toURL();
		this.posfile = posfile;
		this.encoding = encoding;
		this.solotags = milestones;

		// Fail fast when the source cannot be read, but do not keep the stream:
		// every parsing pass opens and closes its own.
		url.openStream().close();
		factory = XMLInputFactory.newInstance();
	}

	/**
	 * Writes the stand-off file: header, one link group per type, then tail.
	 * {@link #setTypesToExtract(List)} must have been called before.
	 * Nothing is written when the output file cannot be created.
	 */
	public void process() {
		assert(extractTypes != null);
		try {
			if (this.createOutput(posfile, encoding)) {
				this.writeHead();
				this.writeBody(posfile, encoding);
				this.writeTail();
			}
		} finally {
			// createOutput may have failed, in which case there is nothing to close
			if (writer != null) {
				writer.close();
				writer = null;
			}
		}
	}

	/**
	 * Creates the output.
	 *
	 * The file is always written in UTF-8, which is the encoding declared by
	 * the XML prolog emitted by {@link #writeHead()}; the encoding parameter is
	 * not used.
	 *
	 * @param outfile the outfile
	 * @param encoding the encoding (ignored)
	 * @return true, if successful
	 */
	private boolean createOutput(File outfile, String encoding){
		try {
			writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile), "UTF-8"));
			return true;
		} catch (Exception e) {
			System.out.println(e.getLocalizedMessage());
			return false;
		}
	}

	/**
	 * Escapes the characters that are not allowed as is in XML text or in a
	 * double-quoted attribute value.
	 *
	 * @param text the raw text, may be null
	 * @return the escaped text, or null when text is null
	 */
	private static String escapeXml(String text) {
		if (text == null) {
			return null;
		}
		return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;").replace("\"", "&quot;");
	}

	/**
	 * @param reader a reader positioned on a start or end element
	 * @return the element prefix followed by ":", or "" when it has no prefix
	 */
	private static String prefixOf(XMLStreamReader reader) {
		String prefix = reader.getPrefix();
		return (prefix == null || prefix.isEmpty()) ? "" : prefix + ":";
	}

	/**
	 * @param localName an element local name
	 * @return true when the element must be written as an empty (milestone) tag
	 */
	private boolean isMilestone(String localName) {
		return solotags != null && solotags.contains(localName);
	}

	/**
	 * set the respStmt tag of the stand-off file by reading the respstmt of the xml-tei-txm file.
	 *
	 * The capture starts on the first resp element whose id is one of the
	 * types to extract and stops at the end of the enclosing respStmt.
	 * respStmt elements holding no requested resp are skipped. When nothing
	 * is captured the respStmt is left empty.
	 */
	private void setResp()
	{
		this.respStmt = "";
		boolean capturing = false;
		String lastOpenLocalName = "";
		StringBuilder captured = new StringBuilder();

		inputData = url.openStream();
		try {
			parser = factory.createXMLStreamReader(inputData);
			try {
				for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
				{
					if (event == XMLStreamConstants.START_ELEMENT) {
						String localName = parser.getLocalName();
						if (localName.equals("resp")) {
							String id = parser.getAttributeValue("xml","id");
							if (id == null)
								id = parser.getAttributeValue(null,"id");
							if (extractTypes.contains(id)) {
								// a matching resp (re)starts the capture
								captured.setLength(0);
								captured.append("<respStmt>\n");
								capturing = true;
							}
						}
						if (capturing) {
							lastOpenLocalName = localName;
							captured.append("\n<").append(prefixOf(parser)).append(localName);
							// NOTE(review): attribute prefixes are dropped (xml:id is written as id) - confirm this is wanted
							for (int i = 0; i < parser.getAttributeCount(); i++) {
								captured.append(" ").append(parser.getAttributeLocalName(i))
										.append("=\"").append(escapeXml(parser.getAttributeValue(i))).append("\"");
							}
							captured.append(isMilestone(localName) ? "/>" : ">");
						}
					}
					else if (event == XMLStreamConstants.END_ELEMENT) {
						String localName = parser.getLocalName();
						if (localName.equals("respStmt")) {
							if (capturing) {
								captured.append("\n</respStmt>");
								this.respStmt = captured.toString();
								System.out.println(this.respStmt);
								return;
							}
							// this respStmt held no requested resp: keep looking
						} else if (capturing && !isMilestone(localName)) {
							// an element closed right after being opened stays on one line
							if (!lastOpenLocalName.equals(localName))
								captured.append("\n");
							captured.append("</").append(prefixOf(parser)).append(localName).append(">");
						}
					}
					else if (event == XMLStreamConstants.CHARACTERS) {
						if (capturing)
							captured.append(escapeXml(parser.getText().trim()));
					}
				}
			} finally {
				parser.close();
			}
		} finally {
			inputData.close();
		}
	}

	/**
	 * write the header of the stand-off file.
	 */
	private void writeHead() {
		this.setResp();

		writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
		writer.write("<!DOCTYPE TEI SYSTEM \"tei_bfm_v1.dtd\">\n");
		writer.write("<TEI xmlns:txm=\"http://textometrie.ens-lyon.fr/1.0\">\n");
		writer.write("<teiHeader xml:lang=\"eng\">\n");
		writer.write("<fileDesc>\n");
		writer.write("<titleStmt>\n");
		for (String type : extractTypes)
			writer.write("<title>"+escapeXml(type)+"</title>\n");
		writer.write(this.respStmt+"\n");
		writer.write("</titleStmt>\n");
		writer.write("<publicationStmt>\n");
		writer.write("<distributor>BFM project - http://bfm.ens-lsh.fr</distributor>\n");
		writer.write("<availability>\n");
		writer.write("<p>(c) 2010 Projet BFM - CNRS/ENS-LSH.\n");
		writer.write("<hi>Conditions d'utilisation</hi> : \n");
		writer.write("Sous licence <ref target=\"http://creativecommons.org/licenses/by-sa/2.0/fr/\">Creative Commons</ref>.\n");
		writer.write("</p>\n");
		writer.write("</availability>\n");
		writer.write("</publicationStmt>\n");
		writer.write("<sourceDesc>\n");
		writer.write("<p>born digital : TXM project - http://textometrie.org</p>\n");
		writer.write("</sourceDesc>\n");
		writer.write("</fileDesc>\n");
		writer.write("</teiHeader>\n");
		writer.write("<text xml:lang=\"fr\" type=\"standoff\">\n");
		writer.write("<body>\n");
		writer.write("<div>\n");
	}

	/**
	 * write the group of ana tags with the attribute type = type.
	 *
	 * One link is written per ana element found inside a w element and whose
	 * type attribute matches; ana elements of another type are ignored.
	 *
	 * @param type the type; it is matched with String.matches, so it is
	 * interpreted as a regular expression
	 */
	private void writeGroup(String type)
	{
		System.out.println("write group : "+type);
		inputData = url.openStream();
		try {
			parser = factory.createXMLStreamReader(inputData);
			try {
				boolean inWord = false;
				boolean inMatchingAna = false;
				String wordId = "";
				StringBuilder value = new StringBuilder();

				writer.write("<linkGrp type=\""+escapeXml(type)+"\">\n");

				for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
				{
					if (event == XMLStreamConstants.START_ELEMENT) {
						String localName = parser.getLocalName();
						if ("w".equals(localName)) {
							wordId = parser.getAttributeValue(null,"id");
							inWord = true;
						} else if ("ana".equals(localName) && inWord) {
							String currentType = parser.getAttributeValue(null,"type");
							// an ana without type attribute cannot match
							if (currentType != null && currentType.matches(type)) {
								inMatchingAna = true;
								value.setLength(0);
							}
						}
					}
					else if (event == XMLStreamConstants.END_ELEMENT) {
						String localName = parser.getLocalName();
						if ("w".equals(localName)) {
							inWord = false;
						} else if ("ana".equals(localName) && inMatchingAna) {
							// only the ana of the requested type produces a link
							writer.write("<link targets=\"#"+escapeXml(wordId)+" #"+escapeXml(value.toString())+"\"/>\n");
							inMatchingAna = false;
						}
					}
					else if (event == XMLStreamConstants.CHARACTERS) {
						if (inMatchingAna)
							value.append(parser.getText().trim()); //catch interp text
					}
				}
				writer.write("</linkGrp>\n");
			} finally {
				parser.close();
			}
		} finally {
			inputData.close();
		}
	}

	/**
	 * write a group per ana type to extract from the anafile.
	 *
	 * @param TTrez not used
	 * @param encoding not used
	 */
	private void writeBody(File TTrez, String encoding)
	{
		for (String type : extractTypes) {
			writeGroup(type);
		}
	}

	/**
	 * write the tail of the stand-off file = close div, body, text and TEI tags.
	 */
	private void writeTail()
	{
		writer.write("</div>\n");
		writer.write("</body>\n");
		writer.write("</text>\n");
		writer.write("</TEI>\n");
	}

	/**
	 * define the types to extract.
	 *
	 * @param types the new types to extract
	 */
	public void setTypesToExtract(List<String> types)
	{
		extractTypes = types;
	}

	/**
	 * The main method: sample run on a file of the user home directory.
	 *
	 * @param args the arguments (not used)
	 */
	public static void main(String[] args) {

		// "~" is a shell shortcut that java.io.File does not expand
		File rootDir = new File(System.getProperty("user.home"), "xml/rgaqcj")
		File extractDir = new File(rootDir, "extract")
		extractDir.mkdirs()

		File anafile = new File(new File(rootDir, "injection"), "roland-ana.xml")
		File posfile = new File(extractDir, "roland-pos.xml")
		String encoding = "UTF-8"
		// extracts the txm:ana whose reference is in this list
		List<String> types = ["#t1","#POS"]

		ArrayList<String> milestones = new ArrayList<String>()
		//the tags you want to keep as milestones
		milestones.add("tagUsage")
		milestones.add("pb")
		milestones.add("lb")
		milestones.add("catRef")

		AnnotationExtraction builder = new AnnotationExtraction(anafile,posfile,encoding,milestones)
		builder.setTypesToExtract(types)
		builder.process()
		return
	}
}