root / tmp / org.txm.core / src / java / org / txm / scripts / importer / XMLTXM2WTC.groovy @ 2473
History | View | Annotate | Download (13 kB)
1 | 881 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 881 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 881 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 881 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 1094 | mdecorde | //
|
6 | 881 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 881 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 881 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 881 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 881 | mdecorde | // later version.
|
11 | 1094 | mdecorde | //
|
12 | 881 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 881 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 881 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 881 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 881 | mdecorde | // details.
|
17 | 1094 | mdecorde | //
|
18 | 881 | mdecorde | // You should have received a copy of the GNU General
|
19 | 881 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 881 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 1094 | mdecorde | //
|
22 | 1094 | mdecorde | //
|
23 | 1094 | mdecorde | //
|
24 | 881 | mdecorde | // $LastChangedDate: 2017-04-11 15:30:35 +0200 (mar. 11 avril 2017) $
|
25 | 881 | mdecorde | // $LastChangedRevision: 3426 $
|
26 | 1094 | mdecorde | // $LastChangedBy: mdecorde $
|
27 | 881 | mdecorde | //
|
28 | 1000 | mdecorde | package org.txm.scripts.importer
|
29 | 881 | mdecorde | |
30 | 881 | mdecorde | import java.text.DateFormat; |
31 | 881 | mdecorde | import java.util.Date; |
32 | 881 | mdecorde | import java.util.ArrayList; |
33 | 881 | mdecorde | import java.util.HashMap; |
34 | 881 | mdecorde | import java.util.LinkedHashMap; |
35 | 881 | mdecorde | import javax.xml.stream.*; |
36 | 881 | mdecorde | import java.net.URL; |
37 | 881 | mdecorde | import org.txm.importer.filters.*; |
38 | 881 | mdecorde | // TODO: Auto-generated Javadoc
|
39 | 881 | mdecorde | |
/**
 * Streams an XML-TEI-TXM document with StAX and writes the matching CQP
 * (CWB input) file: one tab-separated line per {@code <w>} element, plus the
 * start and end tags of the structural elements selected with
 * {@link #setBalisesToKeep(List)}.
 *
 * Typical use: build an instance on the source URL, call
 * {@link #setBalisesToKeep(List)} (mandatory), optionally
 * {@link #setSendToPAttributes(HashMap)} and
 * {@link #setTextInfo(String, String, String)}, then call
 * {@link #transformFile(File)} once. Afterwards {@link #getpAttributs()} and
 * {@link #getsAttributs()} list the word properties and the structures that
 * were actually met in the document.
 *
 * An instance is single-use: the parser is consumed and closed by
 * {@link #transformFile(File)}.
 *
 * @author mdecorde
 */
class XMLTXM2CQP {

	/** Placeholder written for a structure property that currently has no value. */
	private static final String NO_VALUE = "N/A"

	/** Location of the XML-TXM source document. */
	private URL url

	/** Stream opened on {@link #url}; closed when the transformation ends. */
	private InputStream inputData

	/** StAX factory used to create {@link #parser}. */
	private XMLInputFactory factory

	/** StAX reader over {@link #inputData}; null when the constructor failed. */
	private XMLStreamReader parser

	/** Writer on the CQP file, created by {@link #createOutput(File)}. */
	private Writer output

	/** txm:ana values of the current word, keyed by annotation type (without the leading '#'). */
	LinkedHashMap<String, String> anahash = new LinkedHashMap<String, String>()

	/** txm:form values of the current word, keyed by form type ("default" when the type is absent). */
	LinkedHashMap<String, String> formhash = new LinkedHashMap<String, String>()

	/** Attributes read on the {@code <w>} element itself; only "id" is stored. */
	LinkedHashMap<String, String> wordattributes = new LinkedHashMap<String, String>()

	/** Kept structures met so far, each with the (lower-cased) attribute names seen on it. */
	HashMap<String, List<String>> balisesfound

	/** Lower-case local names of the elements to copy to the output as structures. */
	List<String> balisesToKeep

	/** For each structure name, the attributes whose values are copied onto every word line. */
	HashMap<String, List<String>> sendToPAttributes

	/** Column names (structure name + attribute name) of the injected structure properties. */
	List<String> injectedPAttributes = new ArrayList<String>()

	/** Structure properties associated with the default reference pattern. */
	List<String> defaultReferences = new ArrayList<String>()

	/** Default reference pattern; only stored, never read in this class. */
	String defaultReferencePattern

	/** Current value of each injected structure property, keyed like {@link #injectedPAttributes}. */
	HashMap<String, String> injectedPAttributesValues

	/** True once {@link #setTextInfo(String, String, String)} has been called. */
	boolean addinfos = false

	/** Value written as the "id" attribute of {@code <text>} when the element has none. */
	String txtname

	/** Value written as the "base" attribute of {@code <text>} when the element has none. */
	String base

	/** Value written as the "project" attribute of {@code <text>} when the element has none. */
	String project

	/** Reset to false at the start of each transformation; not otherwise used in this class. */
	boolean haspb = false

	/** Reset to false at the start of each transformation; not otherwise used in this class. */
	boolean haslb = false

	/** Language code of the corpus. */
	public String lang = "fr"

	/** Type of the txm:form element being read. */
	public String currentForm

	/** Type of the txm:ana element being read. */
	public String currentAna

	/**
	 * Sets the language code.
	 *
	 * @param lang the language code
	 * @return the value that was assigned
	 */
	public setLang(String lang) {
		this.lang = lang
	}

	/**
	 * Opens the document and prepares the StAX parser.
	 *
	 * Failures are reported on standard output and leave the parser unset, in
	 * which case {@link #transformFile(File)} returns false.
	 *
	 * @param url the location of the XML-TXM document
	 */
	public XMLTXM2CQP(URL url) {
		try {
			this.url = url
			inputData = url.openStream()
			factory = XMLInputFactory.newInstance()
			parser = factory.createXMLStreamReader(inputData)
		} catch (XMLStreamException ex) {
			System.out.println(ex)
			// the stream was opened but cannot be parsed: do not leak it
			try {
				if (inputData != null) inputData.close()
			} catch (IOException ignored) {
				// nothing more can be done with a stream that is already unusable
			}
			inputData = null
		} catch (IOException ex) {
			System.out.println("IOException while parsing " + url + " : " + ex)
		}
	}

	/**
	 * Declares the values to add on {@code <text>} elements that lack an
	 * "id", "base" or "project" attribute.
	 *
	 * @param name the text identifier
	 * @param base the base name
	 * @param project the project name
	 */
	public void setTextInfo(String name, String base, String project) {
		this.addinfos = true
		this.txtname = name
		this.base = base
		this.project = project
	}

	/**
	 * Opens the UTF-8 writer on the CQP file. When the file already exists the
	 * new content is appended to it.
	 *
	 * @param outfile the CQP file
	 * @return true, if the writer could be created
	 */
	private boolean createOutput(File outfile) {
		try {
			boolean append = outfile.exists()
			output = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outfile, append), "UTF-8"))
			return true
		} catch (Exception e) {
			System.err.println(e)
			return false
		}
	}

	/**
	 * Reads the whole document and writes the CQP file.
	 *
	 * The output writer, the parser and the input stream are closed when the
	 * method returns, whether it succeeded or not.
	 *
	 * @param outfile the CQP file (appended to when it already exists)
	 * @return true, if successful; false when no element to keep was declared,
	 *         when the output cannot be created or when parsing fails
	 */
	public boolean transformFile(File outfile) {
		if (balisesToKeep == null) {
			println "no element has been defined to be keeped"
			return false
		}

		haspb = false
		haslb = false

		boolean flagAna = false   // true while inside a txm:ana that has a type
		boolean flagForm = false  // true while inside a txm:form
		String vForm = ""
		String vAna = ""

		wordattributes = [:]
		balisesfound = new HashMap<String, List<String>>()

		if (!createOutput(outfile)) {
			return false
		}

		if (sendToPAttributes != null) {
			// rebuilt on each call so that the columns are never declared twice
			injectedPAttributes = new ArrayList<String>()
			injectedPAttributesValues = [:]
			for (String tag : sendToPAttributes.keySet()) {
				for (String attr : sendToPAttributes.get(tag)) {
					String key = tag + attr
					injectedPAttributes.add(key)
					// words met before the structure opens get the placeholder, not "null"
					injectedPAttributesValues.put(key, NO_VALUE)
				}
			}
		}

		balisesfound.put("txmcorpus", ["lang"])
		try {
			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				if (event == XMLStreamConstants.START_ELEMENT) {
					String localname = parser.getLocalName().toLowerCase()

					// only the structures actually found are declared in the CWB registry
					if (balisesToKeep.contains(localname)) {
						recordStructure(localname)
					}

					switch (localname) {
						case "w":
							wordattributes.put("id", parser.getAttributeValue(null, "id"))
							break

						case "form":
							flagForm = true
							currentForm = parser.getAttributeValue(null, "type")
							if (currentForm == null) {
								currentForm = "default"
							}
							vForm = ""
							break

						case "ana":
							vAna = ""
							currentAna = parser.getAttributeValue(null, "type")
							// an annotation without type cannot be named: it is ignored
							flagAna = (currentAna != null)
							if (flagAna && currentAna.startsWith("#")) {
								currentAna = currentAna.substring(1)
							}
							break

						default:
							storeInjectedValues(localname)
							if (balisesToKeep.contains(localname)) {
								writeStartTag(localname)
							}
					}
				} else if (event == XMLStreamConstants.END_ELEMENT) {
					String localname = parser.getLocalName().toLowerCase()

					switch (localname) {
						case "form":
							if (flagForm) {
								formhash.put(currentForm, vForm)
							}
							flagForm = false
							break

						case "ana":
							if (flagAna) {
								anahash.put(currentAna, vAna)
							}
							flagAna = false
							break

						case "w":
							output.write(buildWordLine() + "\n")
							break

						default:
							resetInjectedValues(localname)
							if (balisesToKeep.contains(localname)) {
								output.write("</" + localname + ">\n")
							}
					}
				} else if (event == XMLStreamConstants.CHARACTERS) {
					if (flagForm) {
						vForm += parser.getText().trim()
					}
					if (flagAna) {
						vAna += parser.getText().trim()
					}
				}
			}
			// closed here so that a failing flush is reported as an error
			output.close()
			return true
		} catch (Exception ex) {
			println "Error while parsing $url : " + ex
			ex.printStackTrace()
			return false
		} finally {
			closeResources()
		}
	}

	/**
	 * Registers a kept structure and the attribute names of the current start
	 * tag, lower-cased like the names written by {@link #writeStartTag(String)}.
	 */
	private void recordStructure(String localname) {
		List<String> attrlist = balisesfound.get(localname)
		if (attrlist == null) {
			attrlist = []
			balisesfound.put(localname, attrlist)
		}
		for (int i = 0; i < parser.getAttributeCount(); i++) {
			String attrname = parser.getAttributeLocalName(i).toLowerCase()
			if (!attrlist.contains(attrname)) {
				attrlist.add(attrname)
			}
		}
	}

	/**
	 * Stores the attribute values of the current start tag that must be
	 * injected on the following word lines.
	 */
	private void storeInjectedValues(String localname) {
		if (sendToPAttributes == null || !sendToPAttributes.keySet().contains(localname)) {
			return
		}
		List<String> attrs = sendToPAttributes.get(localname)
		for (int i = 0; i < parser.getAttributeCount(); i++) {
			String attrname = parser.getAttributeLocalName(i)
			if (attrs.contains(attrname)) {
				// same key as the column name built in transformFile
				injectedPAttributesValues.put(localname + attrname, parser.getAttributeValue(i))
			}
		}
	}

	/** Puts back the placeholder on the injected properties of a structure that closes. */
	private void resetInjectedValues(String localname) {
		if (sendToPAttributes == null || !sendToPAttributes.keySet().contains(localname)) {
			return
		}
		for (String attr : sendToPAttributes.get(localname)) {
			injectedPAttributesValues.put(localname + attr, NO_VALUE)
		}
	}

	/**
	 * Writes the start tag of a kept structure with its attributes, adding the
	 * missing "id", "base" and "project" attributes on {@code <text>} when
	 * {@link #setTextInfo(String, String, String)} was called.
	 */
	private void writeStartTag(String localname) {
		output.write("<" + localname)

		List<String> written = new ArrayList<String>()
		for (int i = 0; i < parser.getAttributeCount(); i++) {
			String attrname = parser.getAttributeLocalName(i).toLowerCase()
			written.add(attrname)
			String value = parser.getAttributeValue(i).replace("&", "&amp;").replace("\"", "&quot;")
			output.write(" " + attrname + "=\"" + value + "\"")
		}

		if (localname.equals("text") && addinfos) {
			List<String> attrlist = balisesfound.get(localname)
			Map<String, String> infos = new LinkedHashMap<String, String>()
			infos.put("id", txtname)
			infos.put("base", base)
			infos.put("project", project)
			for (String name : infos.keySet()) {
				if (!written.contains(name)) {
					output.write(" " + name + "=\"" + infos.get(name) + "\"")
					// declared once, even when several text elements need it
					if (!attrlist.contains(name)) {
						attrlist.add(name)
					}
				}
			}
		}

		output.write(">\n")
	}

	/**
	 * Builds the tab-separated line of the word that just ended: default form,
	 * other forms, word attributes, injected structure properties, then
	 * txm:ana values.
	 *
	 * NOTE(review): formhash and anahash are not cleared between words, so a
	 * word that lacks a form or an annotation shows the value of a previous
	 * word; kept as is because it keeps the number of columns stable.
	 */
	private String buildWordLine() {
		String defaultForm = formhash.get("default")
		if (defaultForm == null) {
			throw new IllegalStateException("word '" + wordattributes.get("id") + "' has no default form")
		}

		// only the first column is escaped
		String line = defaultForm.replace("&", "&amp;").replace("<", "&lt;")
		for (String form : formhash.keySet()) {
			if (form != "default") {
				line += "\t" + formhash.get(form)
			}
		}
		for (String type : wordattributes.keySet()) {
			line += "\t" + wordattributes.get(type)
		}
		if (sendToPAttributes != null) {
			for (String pattr : injectedPAttributes) {
				line += "\t" + injectedPAttributesValues.get(pattr)
			}
		}
		for (String type : anahash.keySet()) {
			line += "\t" + anahash.get(type)
		}
		return line
	}

	/** Closes the writer, the parser and the input stream; each failure is only reported. */
	private void closeResources() {
		try {
			if (output != null) output.close()
		} catch (IOException e) {
			System.err.println("Could not close the output of $url : " + e)
		}
		try {
			if (parser != null) parser.close()
		} catch (XMLStreamException e) {
			System.err.println("Could not close the parser of $url : " + e)
		}
		try {
			if (inputData != null) inputData.close()
		} catch (IOException e) {
			System.err.println("Could not close the input stream of $url : " + e)
		}
	}

	/**
	 * Lists the word properties in the order of the columns that follow the
	 * forms on a word line: word attributes, injected structure properties,
	 * then txm:ana types.
	 *
	 * @return the word property names
	 */
	public List<String> getpAttributs() {
		def pAttributs = []
		pAttributs.addAll(wordattributes.keySet())
		if (sendToPAttributes != null) {
			pAttributs.addAll(this.injectedPAttributes)
		}
		pAttributs.addAll(anahash.keySet())
		return pAttributs
	}

	/**
	 * Lists the structures found by {@link #transformFile(File)}, each written
	 * as {@code name} or {@code name:+attr1+attr2} when attributes were seen.
	 *
	 * @return the structure declarations; empty before any transformation
	 */
	public List<String> getsAttributs() {
		println balisesfound
		def sAttributs = []
		if (balisesfound == null) {
			return sAttributs
		}
		for (String balise : this.balisesfound.keySet()) {
			List<String> sAtt = this.balisesfound.get(balise)
			if (sAtt.size() > 0) {
				String attributes = ""
				for (String attr : sAtt) {
					attributes += "+" + attr
				}
				sAttributs.add(balise + ":" + attributes)
			} else {
				sAttributs.add(balise)
			}
		}
		return sAttributs
	}

	/**
	 * Declares the elements to copy to the output as structures. A null list
	 * is refused with a warning and the previous value is kept.
	 *
	 * @param balisesToKeep the lower-case local names of the elements to keep
	 */
	public void setBalisesToKeep(List<String> balisesToKeep) {
		if (balisesToKeep == null) {
			println("Warning: the list of elements to keep is null")
			return
		}
		this.balisesToKeep = balisesToKeep
	}

	/**
	 * Stores the default reference pattern and the structure properties it
	 * uses. The values are only stored; nothing in this class reads them yet.
	 * A null pattern leaves both values untouched.
	 *
	 * @param pattern the reference pattern
	 * @param strucProperties the structure properties used by the pattern
	 */
	public void setDefaultReference(String pattern, List<String> strucProperties) {
		if (pattern == null) {
			return
		}
		this.defaultReferencePattern = pattern
		if (strucProperties != null) {
			this.defaultReferences = strucProperties
		}
	}

	/**
	 * Declares the structure attributes to copy onto the word lines. A null
	 * map is refused with a warning and the previous value is kept.
	 *
	 * @param sendus for each structure name, the attribute names to inject
	 */
	public void setSendToPAttributes(HashMap<String, List<String>> sendus) {
		if (sendus == null) {
			println("Warning: the pAttributes to inject is null")
			return
		}
		this.sendToPAttributes = sendus
	}

	/**
	 * Manual test on a developer corpus: converts one file and prints the
	 * structures and word properties that were found.
	 *
	 * @param args the arguments (unused)
	 */
	public static void main(String[] args) {

		String rootDir = "/home/mdecorde/TXM/corpora/CORNEILLEMOLIERETER/txm/CORNEILLEMOLIERETER"

		File srcfile = new File(rootDir, "CORNEILLEP_AGESILAS_1666.xml")
		println srcfile.exists()
		File cqpfile = new File(rootDir, "out/CORNEILLEP_AGESILAS_1666.cqp")
		File outdir = new File(rootDir, "out")
		outdir.deleteDir()
		outdir.mkdir()

		System.out.println("XMLTXM2CQP : " + srcfile + " >> " + cqpfile)
		def builder = new XMLTXM2CQP(srcfile.toURI().toURL())
		def balises = ["text", "s"]
		builder.setBalisesToKeep(balises)
		builder.transformFile(cqpfile)

		println("SATTRIBUTS: " + builder.getsAttributs())
		println("PATTRIBUTS: " + builder.getpAttributs())
		return
	}
}