root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xml / compiler.groovy @ 1688
History | View | Annotate | Download (20 kB)
1 | 321 | mdecorde | |
---|---|---|---|
2 | 321 | mdecorde | |
3 | 321 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
4 | 321 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
5 | 321 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
6 | 321 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
7 | 321 | mdecorde | //
|
8 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it
|
9 | 321 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
10 | 321 | mdecorde | // License as published by the Free Software Foundation,
|
11 | 321 | mdecorde | // either version 2 of the License, or (at your option) any
|
12 | 321 | mdecorde | // later version.
|
13 | 321 | mdecorde | //
|
14 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
15 | 321 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
16 | 321 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
17 | 321 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
18 | 321 | mdecorde | // details.
|
19 | 321 | mdecorde | //
|
20 | 321 | mdecorde | // You should have received a copy of the GNU General
|
21 | 321 | mdecorde | // Public License along with the TXM platform. If not, see
|
22 | 321 | mdecorde | // http://www.gnu.org/licenses.
|
23 | 321 | mdecorde | //
|
24 | 321 | mdecorde | //
|
25 | 321 | mdecorde | //
|
26 | 479 | mdecorde | // $LastChangedDate: 2016-05-26 17:42:36 +0200 (jeu. 26 mai 2016) $
|
27 | 321 | mdecorde | // $LastChangedRevision: 3219 $
|
28 | 321 | mdecorde | // $LastChangedBy: mdecorde $
|
29 | 321 | mdecorde | //
|
30 | 321 | mdecorde | |
31 | 321 | mdecorde | |
32 | 986 | mdecorde | package org.txm.scripts.importer.xml;
|
33 | 321 | mdecorde | |
34 | 321 | mdecorde | import java.util.ArrayList |
35 | 321 | mdecorde | import java.util.Collections |
36 | 1000 | mdecorde | import org.txm.importer.cwb.BuildCwbEncodeArgs |
37 | 1000 | mdecorde | import org.txm.importer.cwb.CwbEncode |
38 | 1000 | mdecorde | import org.txm.importer.cwb.CwbMakeAll |
39 | 986 | mdecorde | import org.txm.scripts.importer.* |
40 | 321 | mdecorde | import org.txm.scripts.* |
41 | 1000 | mdecorde | import org.txm.importer.scripts.xmltxm.* |
42 | 321 | mdecorde | import org.txm.utils.treetagger.TreeTagger |
43 | 1110 | mdecorde | import org.txm.objects.* |
44 | 321 | mdecorde | import javax.xml.stream.* |
45 | 321 | mdecorde | |
46 | 321 | mdecorde | import java.net.URL |
47 | 321 | mdecorde | import java.io.File |
48 | 321 | mdecorde | import java.util.HashMap |
49 | 321 | mdecorde | import java.util.List |
50 | 321 | mdecorde | import java.util.HashMap |
51 | 321 | mdecorde | import java.util.HashSet |
52 | 321 | mdecorde | import org.txm.metadatas.* |
53 | 1613 | mdecorde | import org.txm.utils.ConsoleProgressBar |
54 | 479 | mdecorde | import org.txm.utils.io.FileCopy |
55 | 1110 | mdecorde | import org.txm.searchengine.cqp.corpus.* |
56 | 321 | mdecorde | |
57 | 321 | mdecorde | /**
|
58 | 321 | mdecorde | * The "compiler" Class of the XML/w import module.
|
59 | 321 | mdecorde | */
|
60 | 321 | mdecorde | class compiler { |
61 | 321 | mdecorde | |
/** Debug switch; forwarded to the CWB tools and used to print the collected structures. */
private boolean debug = false

/** Input stream opened on the XML document being read. */
private def inputData

/** StAX factory used to create the parser. */
private def factory

/** StAX parser reading the current XML document. */
private XMLStreamReader parser

/** Directory holder; not assigned in the visible part of this class. */
private def dir

/** Writer on the CQP file, opened by createOutput. */
private def output

/** Location of the XML document given to the constructor. */
private def url

/** Analysis types found in the source files, shared by all instances. */
private static anatypes = []
/** Analysis values of the word being read, keyed by analysis type. */
private static anavalues = [:]

/** String-to-string map; not read or written in the visible part of this class. */
private HashMap<String, String> anahash = new HashMap<String, String>()

/** Listener fed with every structural element that is started or ended. */
private static SAttributesListener sattrsListener
/** Structures and their attribute names, as returned by the listener. */
private static HashMap<String, ArrayList<String>> structs
/** Integer value per structure, as returned by the listener's getProfs(). */
private static HashMap<String, Integer> structsProf

/** Identifier written as the "id" attribute of the text element. */
String text = ""

/** Value written as the "base" attribute of the text element. */
String base = ""

/** Text attribute names; null until assigned. */
String[] textAttributes = null

/** Language written as the "lang" attribute of the txmcorpus element. */
private String lang = "fr"

/** Name of the metadata used to sort the files; null means the default file order. */
public static sortMetadata = null
/** When true, attribute values and analysis values are lowercased. */
public static normalizeMetadata = false
108 | 321 | mdecorde | |
/**
 * Creates a compiler that is not bound to any source document.
 * No stream or parser is opened; the fields keep their declared defaults.
 */
public compiler() {
}
|
114 | 321 | mdecorde | |
/**
 * Stores the metadata options in the static settings shared by all instances.
 *
 * @param sortmetadata value assigned to the static sortMetadata field
 * @param normalizemetadata value assigned to the static normalizeMetadata field
 */
public void setOptions(String sortmetadata, boolean normalizemetadata) {
	normalizeMetadata = normalizemetadata
	sortMetadata = sortmetadata
}
120 | 321 | mdecorde | |
/**
 * Instantiates a new compiler bound to one XML document and opens a StAX parser on it.
 *
 * The shared SAttributesListener is created on first use and restarted on the
 * new parser afterwards. Errors are reported on the console and not rethrown,
 * so the parser may still be null after construction.
 *
 * @param url the location of the XML document to read
 * @param text the identifier written as the "id" attribute of the text element
 * @param base the value written as the "base" attribute of the text element
 * @param projectName not used by this constructor
 */
public compiler(URL url, String text, String base, String projectName)
{
	this.text = text
	this.base = base
	this.url = url
	try {
		inputData = url.openStream()

		factory = XMLInputFactory.newInstance()
		parser = factory.createXMLStreamReader(inputData)

		if (sattrsListener == null)
			sattrsListener = new SAttributesListener(parser)
		else
			sattrsListener.start(parser)

	} catch (XMLStreamException ex) {
		System.out.println(ex)
		// the parser could not be created: do not leave the stream open
		if (parser == null && inputData != null) {
			try {
				inputData.close()
			} catch (IOException ignored) {
				// nothing more can be done with a stream that fails to close
			}
		}
	} catch (IOException ex) {
		System.err.println("IOException while parsing "+url+": "+ex)
	}
}
152 | 321 | mdecorde | |
/**
 * Sets the language of the corpus.
 *
 * @param lang the language code to store
 * @return the stored value (the method has no declared return type)
 */
public setLang(String lang) {
	return this.lang = lang
}
163 | 321 | mdecorde | |
/** Annotation outcome flag; false until set. */
boolean annotationSuccess = false

/**
 * Records the annotation outcome flag.
 *
 * @param val the value to store in annotationSuccess
 */
public void setAnnotationSuccess(boolean val) {
	annotationSuccess = val
}
176 | 321 | mdecorde | |
/**
 * Opens the output writer on the given file, encoded in UTF-8.
 * A file that already exists is opened in append mode.
 *
 * @param f the file to write to
 * @return true if the writer was opened, false if an exception occurred (it is printed on stderr)
 */
private boolean createOutput(File f) {
	boolean opened
	try {
		boolean append = f.exists()
		def fileStream = new FileOutputStream(f, append)
		output = new OutputStreamWriter(new BufferedOutputStream(fileStream), "UTF-8")
		opened = true
	} catch (Exception e) {
		System.err.println(e)
		opened = false
	}
	return opened
}
193 | 321 | mdecorde | |
/**
 * Advances the parser until just after the end tag of the "teiHeader" element,
 * or until the end of the document when there is no such element.
 */
private void GoToText() {
	int event = parser.next()
	while (event != XMLStreamConstants.END_DOCUMENT) {
		boolean closesHeader = event == XMLStreamConstants.END_ELEMENT && parser.getLocalName().equals("teiHeader")
		if (closesHeader) {
			return
		}
		event = parser.next()
	}
}
205 | 321 | mdecorde | |
/**
 * Appends the CQP representation of the document opened by the constructor to a CQP file.
 *
 * Element names are compared in lower case:
 * - "text" is written as a text element carrying the id, base and project
 *   attributes, followed by the source attributes other than "id";
 * - "w" produces one line: form, word id, then one tab-separated value per
 *   known analysis type (empty when the word has no value for that type);
 * - "form" and "ana" only collect the character data of the current word;
 * - any other element is written as a tag, except between the "tei" start
 *   tag and the "text" start tag, where it is skipped. An element without
 *   attributes receives an "n" counter attribute.
 *
 * On failure the source XML file is copied into a "compiler-error" directory
 * next to the CQP file. The writer, the parser and the input stream are
 * closed in every case; content already written before a failure stays in
 * the CQP file.
 *
 * @param project the project whose name is written as the "project" attribute
 * @param cqpFile the CQP file to append to
 * @param textmetadata not used by this method
 * @return true if the document was fully written, false if the output could
 *         not be opened or an exception occurred
 */
public boolean transfomFileCqp(Project project, File cqpFile, HashMap<String, String> textmetadata)
{
	String vAna = ""
	String vForm = ""
	String wordid = ""

	def ncounts = [:] // next "n" value per element name, for elements without attributes

	boolean flagForm = false
	boolean flagAna = false

	String anatype = ""
	String anavalue = ""
	boolean foundtei = false
	boolean foundtext = false
	try {
		if (!createOutput(cqpFile))
			return false

		String localname
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next())
		{
			switch (event) {
				case XMLStreamConstants.START_ELEMENT:
					localname = parser.getLocalName().toLowerCase()
					if ("tei".equals(localname)) foundtei = true
					switch (localname) {
						case "text":
							sattrsListener.startElement(localname)
							foundtext = true
							output.write("<text id=\""+text+"\" base=\""+base+"\"" + " project=\""+project.getName()+"\"")
							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								String attrname = parser.getAttributeLocalName(i)
								// double quotes would end the attribute value early
								String attrvalue = parser.getAttributeValue(i).replaceAll("\"", "&quot;")
								if (normalizeMetadata)
									attrvalue = attrvalue.toLowerCase()
								if (attrname != "id") // the id attribute is already written from the text field
									output.write(" "+attrname.toLowerCase()+"=\""+attrvalue+"\"")
							}
							output.write(">\n")
							break

						case "w":
							for (int i = 0 ; i < parser.getAttributeCount(); i++) {
								if (parser.getAttributeLocalName(i).equals("id")) {
									wordid = parser.getAttributeValue(i)
								}
							}
							anavalues = [:]
							break

						case "form":
							flagForm = true
							vForm = ""
							vAna = ""
							break

						case "ana":
							flagAna = true
							anavalue = ""
							for (int i = 0 ; i < parser.getAttributeCount(); i++)
								if ("type".equals(parser.getAttributeLocalName(i))) {
									anatype = parser.getAttributeValue(i).substring(1) // remove the #
									break
								}
							break

						default:
							// elements located before the text element of a TEI document are ignored
							if (foundtei && !foundtext) break

							sattrsListener.startElement(localname)
							output.write("<"+localname)

							for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
								String attrname = parser.getAttributeLocalName(i)
								String attrvalue = parser.getAttributeValue(i)
								if (normalizeMetadata)
									attrvalue = attrvalue.toLowerCase()
								output.write(" "+attrname.toLowerCase()+"=\""+attrvalue.replaceAll("\"", "&quot;")+"\"")
							}
							if (parser.getAttributeCount() == 0) { // add the n attribute
								if (!ncounts.containsKey(localname)) ncounts.put(localname, 0)
								int ncount = ncounts.get(localname)
								ncounts.put(localname, ncount+1)
								output.write(" n=\""+ncount+"\"")
							}
							output.write(">\n")
					}
					break

				case XMLStreamConstants.END_ELEMENT:
					localname = parser.getLocalName().toLowerCase()
					switch (localname) {
						case "w":
							// one column per known analysis type, empty when the word has none
							for (String type : anatypes) {
								def v = anavalues.get(type)
								if (v != null) vAna += "\t"+v
								else vAna += "\t"
							}
							// the parser delivers decoded text: escape it again for the CQP file
							vForm = vForm.replaceAll("\n", "").replaceAll("&", "&amp;").replaceAll("<", "&lt;")
							output.write(vForm+"\t"+wordid+vAna+"\n")
							vAna = ""
							vForm = ""
							break

						case "tei":
							break
						case "form":
							flagForm = false
							break
						case "ana":
							anavalues.put(anatype, anavalue)
							flagAna = false
							break
						default:
							if (foundtei && !foundtext) break

							sattrsListener.endElement(localname)
							output.write("</"+localname+">\n")
					}
					break

				case XMLStreamConstants.CHARACTERS:
					if (flagForm)
						vForm += parser.getText().trim()
					if (flagAna) {
						if (normalizeMetadata)
							anavalue += parser.getText().trim().toLowerCase()
						else
							anavalue += parser.getText().trim()
					}
					break
			}
		}
		// closing flushes the buffered content; a failure here is a compilation failure
		output.close()
		return true
	} catch (Exception ex) {
		System.out.println("Exception while parsing " + inputData+" of Text "+text+": "+ex)
		File xmlFile = null
		File errorDir = null
		try {
			xmlFile = new File(url.getFile())
			errorDir = new File(cqpFile.getParentFile(), "compiler-error")
			println "Warning: Moving $xmlFile to $errorDir"
			errorDir.mkdir()
			FileCopy.copy(xmlFile, new File(errorDir, xmlFile.getName()))
		} catch(Exception eCopy) {
			println "Error while moving "+url+" to "+errorDir
		}
		return false
	} finally {
		// release every resource whatever the outcome; closing twice is harmless
		if (output != null) {
			try {
				output.close()
			} catch (Exception ignored) {
				// already reported above when the write itself failed
			}
		}
		if (parser != null) {
			try {
				parser.close()
			} catch (Exception ignored) {
				// nothing left to read from this parser
			}
		}
		if (inputData != null) {
			try {
				inputData.close()
			} catch (Exception ignored) {
				// nothing left to read from this stream
			}
		}
	}
}
408 | 321 | mdecorde | |
/**
 * Scans one XML file and registers the analysis types it declares.
 *
 * For every "ana" element, the value of its "type" attribute without its
 * first character (the leading #) is collected. Types not yet present in
 * the shared anatypes list are appended once the whole file has been read.
 * The parser and the input stream are closed even when parsing fails; the
 * exception is still propagated to the caller.
 *
 * @param xmlFile the XML file to scan
 */
private void getAnaTypes(File xmlFile) {
	HashSet<String> types = new HashSet<String>()
	String ana = "ana"

	inputData = xmlFile.toURI().toURL().openStream()
	try {
		factory = XMLInputFactory.newInstance()
		parser = factory.createXMLStreamReader(inputData)
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
			if (event == XMLStreamConstants.START_ELEMENT && ana.equals(parser.getLocalName())) {
				for (int i = 0 ; i < parser.getAttributeCount(); i++) { // find @type
					if ("type".equals(parser.getAttributeLocalName(i))) {
						types.add(parser.getAttributeValue(i).substring(1)) // remove the #
						break
					}
				}
			}
		}
	} finally {
		// nested so that the stream is closed even if closing the parser fails
		try {
			if (parser != null) parser.close()
		} finally {
			if (inputData != null) inputData.close()
		}
	}

	for (String type : types)
		if (!anatypes.contains(type))
			anatypes << type
}
435 | 321 | mdecorde | |
436 | 321 | mdecorde | /**
|
437 | 321 | mdecorde | * Run.
|
438 | 321 | mdecorde | *
|
439 | 321 | mdecorde | * @param rootDirFile the root dir file
|
440 | 321 | mdecorde | * @param basename the basename
|
441 | 321 | mdecorde | * @param textAttributes the text attributes
|
442 | 321 | mdecorde | * @param srcfiles the srcfiles
|
443 | 321 | mdecorde | * @return true, if successful
|
444 | 321 | mdecorde | */
|
445 | 1110 | mdecorde | public boolean run(Project project, File binDir, File txmDir, String corpusname, String[] textAttributes, def srcfiles, Metadatas metadatas) |
446 | 321 | mdecorde | { |
447 | 321 | mdecorde | sattrsListener = null; // reset SAttribute Listener for each new import |
448 | 321 | mdecorde | String rootDir = binDir.getAbsolutePath();
|
449 | 321 | mdecorde | anatypes = [] // reset |
450 | 321 | mdecorde | anavalues = [:] // reset
|
451 | 714 | mdecorde | |
452 | 714 | mdecorde | if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
|
453 | 1110 | mdecorde | println ("Error: CWB executables rights are not well setted.")
|
454 | 321 | mdecorde | return false; |
455 | 321 | mdecorde | } |
456 | 1110 | mdecorde | |
457 | 1110 | mdecorde | CorpusBuild corpus = project.getCorpusBuild(project.getName()); |
458 | 1110 | mdecorde | if (corpus != null) { |
459 | 1110 | mdecorde | //println "CLEAN PREVIOUS CORPUS"
|
460 | 1110 | mdecorde | corpus.delete(); // remove old files
|
461 | 321 | mdecorde | } |
462 | 1110 | mdecorde | |
463 | 1110 | mdecorde | // make new one
|
464 | 1110 | mdecorde | corpus = new MainCorpus(project);
|
465 | 1110 | mdecorde | corpus.setID(project.getName()); |
466 | 1110 | mdecorde | corpus.setName(project.getName()); |
467 | 1110 | mdecorde | corpus.setDescription("Built with the XML/w import module");
|
468 | 1110 | mdecorde | |
469 | 1110 | mdecorde | File cqpFile = new File(binDir,"cqp/"+corpusname+".cqp"); |
470 | 1395 | mdecorde | cqpFile.delete() |
471 | 1395 | mdecorde | |
472 | 1110 | mdecorde | new File(binDir,"cqp").mkdirs() |
473 | 1110 | mdecorde | new File(binDir,"data").mkdirs() |
474 | 1110 | mdecorde | new File(binDir,"registry").mkdirs() |
475 | 321 | mdecorde | |
476 | 321 | mdecorde | String textid = "" |
477 | 321 | mdecorde | int counttext = 0 |
478 | 1110 | mdecorde | List<File> files = txmDir.listFiles(new FileFilter() { |
479 | 1110 | mdecorde | public boolean accept(File f) { |
480 | 1110 | mdecorde | return !f.isDirectory() && !f.isHidden() && f.getName().endsWith(".xml"); |
481 | 1110 | mdecorde | } |
482 | 1110 | mdecorde | }); |
483 | 804 | mdecorde | //1- Transform into CQP file
|
484 | 321 | mdecorde | def builder = null |
485 | 321 | mdecorde | |
486 | 321 | mdecorde | //start corpus
|
487 | 803 | mdecorde | if (createOutput(cqpFile)) {
|
488 | 321 | mdecorde | output.write("<txmcorpus lang=\""+lang+"\">\n") |
489 | 321 | mdecorde | output.close() |
490 | 321 | mdecorde | } |
491 | 321 | mdecorde | |
492 | 321 | mdecorde | // sort files
|
493 | 321 | mdecorde | if (sortMetadata == null) { |
494 | 321 | mdecorde | Collections.sort(files)
|
495 | 321 | mdecorde | } else {
|
496 | 321 | mdecorde | HashMap<File, String> sortmetadatavalues = new HashMap<File, String>() |
497 | 321 | mdecorde | for (File f : files) { |
498 | 321 | mdecorde | String value = MetadataGetter.get(f,"text", sortMetadata) |
499 | 321 | mdecorde | sortmetadatavalues.put(f, value) |
500 | 321 | mdecorde | } |
501 | 321 | mdecorde | println "sort properties value: "+sortmetadatavalues
|
502 | 321 | mdecorde | Collections.sort(files, new Comparator<File>() { |
503 | 321 | mdecorde | /**
|
504 | 321 | mdecorde | * Compare.
|
505 | 321 | mdecorde | *
|
506 | 321 | mdecorde | * @param o1 the o1
|
507 | 321 | mdecorde | * @param o2 the o2
|
508 | 321 | mdecorde | * @return the int
|
509 | 321 | mdecorde | */
|
510 | 321 | mdecorde | public int compare(Object o1, Object o2) { |
511 | 321 | mdecorde | String v1 = sortmetadatavalues.get((File)o1) |
512 | 321 | mdecorde | String v2 = sortmetadatavalues.get((File)o2) |
513 | 321 | mdecorde | if (v1 == null || v2 == null) return 0; |
514 | 321 | mdecorde | return v1.compareTo(v2)
|
515 | 321 | mdecorde | } |
516 | 321 | mdecorde | }); |
517 | 321 | mdecorde | } |
518 | 321 | mdecorde | |
519 | 321 | mdecorde | // get all anatypes
|
520 | 321 | mdecorde | for (File f : files) { |
521 | 321 | mdecorde | getAnaTypes(f) |
522 | 321 | mdecorde | } |
523 | 321 | mdecorde | |
524 | 1613 | mdecorde | println("Compiling "+files.size()+" files ") |
525 | 1613 | mdecorde | ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
|
526 | 321 | mdecorde | for (File f : files) { |
527 | 1613 | mdecorde | cpb.tick() |
528 | 321 | mdecorde | HashMap<String, String> textmetadata; |
529 | 321 | mdecorde | if (metadatas != null) |
530 | 321 | mdecorde | textmetadata = metadatas.getTextMetadata(f) |
531 | 321 | mdecorde | else
|
532 | 321 | mdecorde | textmetadata = [:] |
533 | 321 | mdecorde | |
534 | 321 | mdecorde | counttext++; |
535 | 321 | mdecorde | if (!f.exists()) {
|
536 | 321 | mdecorde | println("file "+f+ " does not exists") |
537 | 321 | mdecorde | } else {
|
538 | 321 | mdecorde | String txtname = f.getName().substring(0,f.getName().length()-4) |
539 | 321 | mdecorde | builder = new compiler(f.toURI().toURL(), txtname, corpusname, "default") |
540 | 321 | mdecorde | builder.setLang(lang); |
541 | 1110 | mdecorde | if (!builder.transfomFileCqp(project, cqpFile, textmetadata)) {
|
542 | 321 | mdecorde | println("Failed to compile "+f)
|
543 | 321 | mdecorde | } |
544 | 321 | mdecorde | } |
545 | 321 | mdecorde | } |
546 | 1613 | mdecorde | cpb.done() |
547 | 1613 | mdecorde | |
548 | 321 | mdecorde | //end corpus
|
549 | 803 | mdecorde | if (createOutput(cqpFile)) {
|
550 | 321 | mdecorde | output.write("</txmcorpus>\n")
|
551 | 321 | mdecorde | output.close() |
552 | 321 | mdecorde | } |
553 | 321 | mdecorde | println ""
|
554 | 321 | mdecorde | //2- Import into CWB
|
555 | 321 | mdecorde | def outDir = rootDir
|
556 | 321 | mdecorde | |
557 | 321 | mdecorde | CwbEncode cwbEn = new CwbEncode()
|
558 | 321 | mdecorde | cwbEn.setDebug(debug) |
559 | 321 | mdecorde | CwbMakeAll cwbMa = new CwbMakeAll()
|
560 | 321 | mdecorde | cwbMa.setDebug(debug) |
561 | 321 | mdecorde | |
562 | 321 | mdecorde | List<String> pargs = [] |
563 | 321 | mdecorde | pargs.add("id")
|
564 | 321 | mdecorde | for (String ana : anatypes) |
565 | 321 | mdecorde | pargs.add(ana) |
566 | 321 | mdecorde | |
567 | 321 | mdecorde | String[] pAttrs = pargs |
568 | 321 | mdecorde | |
569 | 321 | mdecorde | structs = sattrsListener.getStructs() |
570 | 321 | mdecorde | structsProf = sattrsListener.getProfs() |
571 | 321 | mdecorde | |
572 | 321 | mdecorde | if (debug) {
|
573 | 321 | mdecorde | println structs |
574 | 321 | mdecorde | println structsProf |
575 | 321 | mdecorde | } |
576 | 321 | mdecorde | |
577 | 321 | mdecorde | List<String> sargs = new ArrayList<String>() |
578 | 321 | mdecorde | def tmpTextAttrs = [] |
579 | 321 | mdecorde | for (String name : structs.keySet()) { |
580 | 321 | mdecorde | if (name == "text") { |
581 | 321 | mdecorde | for (String value : structs.get(name)) // append the attributes |
582 | 321 | mdecorde | tmpTextAttrs << value // added after
|
583 | 321 | mdecorde | continue;
|
584 | 321 | mdecorde | } |
585 | 321 | mdecorde | //if ( name == "q") continue; // added after
|
586 | 321 | mdecorde | //if ( name == "foreign") continue; // added after
|
587 | 321 | mdecorde | String concat = name+":"+structsProf.get(name); // append the depth |
588 | 321 | mdecorde | for (String attributeName : structs.get(name)) // append the attributes |
589 | 321 | mdecorde | concat += "+"+attributeName.toLowerCase();
|
590 | 321 | mdecorde | |
591 | 321 | mdecorde | if (structs.get(name).size() == 0) { |
592 | 321 | mdecorde | concat += "+n";
|
593 | 321 | mdecorde | } else {
|
594 | 321 | mdecorde | if (!structs.get(name).contains("n")) |
595 | 321 | mdecorde | concat += "+n"
|
596 | 321 | mdecorde | } |
597 | 321 | mdecorde | |
598 | 321 | mdecorde | if ((name == "p" || name == "body" || name == "back" || name == "front") |
599 | 321 | mdecorde | && !concat.contains("+n+") && !concat.endsWith("+n")) |
600 | 321 | mdecorde | concat += "+n"
|
601 | 321 | mdecorde | |
602 | 321 | mdecorde | sargs.add(concat) |
603 | 321 | mdecorde | } |
604 | 321 | mdecorde | |
605 | 321 | mdecorde | String textSAttributes = "text:0+id+base+project"; |
606 | 321 | mdecorde | for (String name : tmpTextAttrs) { |
607 | 321 | mdecorde | if (!("id".equals(name) || "base".equals(name) || "project".equals(name))) |
608 | 321 | mdecorde | textSAttributes += "+"+name.toLowerCase()
|
609 | 321 | mdecorde | } |
610 | 321 | mdecorde | // if (metadataXPath != null) {
|
611 | 321 | mdecorde | // for (String meta : metadataXPath.keySet()) // text property declarations from metadata.csv
|
612 | 321 | mdecorde | // textSAttributes+="+"+meta;
|
613 | 321 | mdecorde | // }
|
614 | 321 | mdecorde | sargs.add(textSAttributes) |
615 | 321 | mdecorde | sargs.add("txmcorpus:0+lang")
|
616 | 321 | mdecorde | |
617 | 321 | mdecorde | sargs.sort() |
618 | 321 | mdecorde | |
619 | 321 | mdecorde | String[] sAttributes = sargs |
620 | 321 | mdecorde | String[] pAttributes = pAttrs |
621 | 321 | mdecorde | println "P-attributes: "+pAttributes
|
622 | 321 | mdecorde | println "S-attributes: "+sargs
|
623 | 321 | mdecorde | |
624 | 321 | mdecorde | //if(!annotationSuccess)
|
625 | 321 | mdecorde | //pAttributes = ["id"];
|
626 | 321 | mdecorde | |
627 | 321 | mdecorde | //println "PATTRIBUTES : "+pargs;
|
628 | 321 | mdecorde | /*
|
629 | 321 | mdecorde | ArrayList<String> wordstag = ["w"];
|
630 | 321 | mdecorde | println "Getting structural attributes..."
|
631 | 321 | mdecorde | BuildCwbEncodeArgs argsgetter = new BuildCwbEncodeArgs();
|
632 | 321 | mdecorde | HashMap<String, HashSet<String>> allStructures = new HashMap<String, HashSet<String>>();
|
633 | 321 | mdecorde | HashMap<String, Integer> allStructuresInclusion = new HashMap<String, Integer>();
|
634 | 321 | mdecorde | for (File srcfile: txmDir.listFiles()) {
|
635 | 321 | mdecorde | if (!(!srcfile.getName().endsWith(".csv") && srcfile.canRead() && !srcfile.isHidden() && !srcfile.isDirectory() && ValidateXml.test(srcfile)))
|
636 | 321 | mdecorde | continue;
|
637 | 321 | mdecorde | print "."
|
638 | 321 | mdecorde | argsgetter.process(srcfile, wordstag);
|
639 | 321 | mdecorde | for (String sattr : argsgetter.getSAttributes()) {
|
640 | 321 | mdecorde | int idx = sattr.indexOf(":");
|
641 | 321 | mdecorde | if(idx < 0 )
|
642 | 321 | mdecorde | continue;
|
643 | 321 | mdecorde | String name = sattr.substring(0, idx);
|
644 | 321 | mdecorde | if (!allStructures.containsKey(name)) {
|
645 | 321 | mdecorde | allStructures.put(name, new HashSet<String>());
|
646 | 321 | mdecorde | allStructuresInclusion.put(name, 0);
|
647 | 321 | mdecorde | }
|
648 | 321 | mdecorde | //println "sattr: "+name
|
649 | 321 | mdecorde | String attrs = sattr.substring(idx+1);
|
650 | 321 | mdecorde | String[] split = attrs.split("\\+");
|
651 | 321 | mdecorde | if (split.length > 0) {
|
652 | 321 | mdecorde | int start = 1;
|
653 | 321 | mdecorde | try {// test if first attr is a number
|
654 | 321 | mdecorde | int n = Integer.parseInt(split[0]);
|
655 | 321 | mdecorde | if (n > allStructuresInclusion.get(name))
|
656 | 321 | mdecorde | allStructuresInclusion.put(name, n);
|
657 | 321 | mdecorde | } catch(Exception e) {start = 0;}
|
658 | 321 | mdecorde | for (int i = start ; i < split.length ; i++)
|
659 | 321 | mdecorde | allStructures.get(name).add(split[i]);
|
660 | 321 | mdecorde | }
|
661 | 321 | mdecorde | }
|
662 | 321 | mdecorde | }
|
663 | 321 | mdecorde | // add structures+properties found in sources
|
664 | 321 | mdecorde | List<String> sargs = new ArrayList<String>();
|
665 | 321 | mdecorde | for (String name : allStructuresInclusion.keySet()) {
|
666 | 321 | mdecorde | String concat = name+":"+allStructuresInclusion.get(name);
|
667 | 321 | mdecorde | for (String value : allStructures.get(name))
|
668 | 321 | mdecorde | concat += "+"+value;
|
669 | 321 | mdecorde | if (name.equals("text")) {
|
670 | 321 | mdecorde | concat += "+base+project"
|
671 | 321 | mdecorde | if (!concat.contains("id"))
|
672 | 321 | mdecorde | concat += "+id";
|
673 | 321 | mdecorde | }
|
674 | 321 | mdecorde | sargs.add(concat);
|
675 | 321 | mdecorde | }*/
|
676 | 321 | mdecorde | |
677 | 321 | mdecorde | // for (int i = 0 ; i < sargs.size() ; i++) {
|
678 | 321 | mdecorde | // if (sargs.get(i).startsWith("text:")) {
|
679 | 321 | mdecorde | // String str = sargs.get(i);
|
680 | 321 | mdecorde | // sargs.set(i, "text:"+str.substring(6));
|
681 | 321 | mdecorde | // }
|
682 | 321 | mdecorde | // }
|
683 | 321 | mdecorde | |
684 | 321 | mdecorde | // String textSAttributes = "text:0+id+base+project";
|
685 | 321 | mdecorde | // if (metadatas != null) {
|
686 | 321 | mdecorde | // for (String meta : metadatas.getHeadersList()) // text property declarations from metadata.csv
|
687 | 321 | mdecorde | // textSAttributes+="+"+meta;
|
688 | 321 | mdecorde | // }
|
689 | 321 | mdecorde | //sargs.add(textSAttributes)
|
690 | 321 | mdecorde | //sargs.add("txmcorpus:0+lang")
|
691 | 321 | mdecorde | |
692 | 321 | mdecorde | // String[] sAttributes = sargs;
|
693 | 321 | mdecorde | // System.out.println("\nCorpus structures: "+sAttributes);
|
694 | 321 | mdecorde | // System.out.println("corpus word properties: "+pAttributes);
|
695 | 321 | mdecorde | |
696 | 321 | mdecorde | try {
|
697 | 321 | mdecorde | String regPath = outDir + "/registry/"+corpusname.toLowerCase(); |
698 | 714 | mdecorde | cwbEn.run( |
699 | 321 | mdecorde | outDir + "/data/$corpusname",
|
700 | 803 | mdecorde | outDir + "/cqp/"+corpusname+".cqp", |
701 | 321 | mdecorde | regPath, pAttributes, sAttributes); |
702 | 321 | mdecorde | if (!new File(regPath).exists()) { |
703 | 321 | mdecorde | println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
|
704 | 321 | mdecorde | return false; |
705 | 321 | mdecorde | } |
706 | 714 | mdecorde | cwbMa.run(corpusname, outDir + "/registry");
|
707 | 321 | mdecorde | } catch (Exception ex) {System.out.println(ex); return false;} |
708 | 321 | mdecorde | |
709 | 321 | mdecorde | return true; |
710 | 321 | mdecorde | } |
711 | 321 | mdecorde | |
/**
 * Turns debug mode on for this compiler instance.
 *
 * The method only raises the {@code debug} flag to {@code true}; nothing
 * in this method switches it back off.
 */
public void setDebug() {
	// Flag is only ever raised here; it is read elsewhere in the class.
	this.debug = true
}
719 | 321 | mdecorde | |
/**
 * Manual test entry point: builds a {@code compiler}, enables debug mode,
 * sets the CWB binaries path and runs it on the "geo" sample directory.
 *
 * Both locations are resolved against the current user's home directory
 * ({@code <user.home>/xml/geo} and {@code <user.home>/TXM/cwb/bin}).
 *
 * @param args the command line arguments (not used)
 */
public static void main(String[] args) {
	// java.io.File performs no shell-style "~" expansion: "~/xml/geo" would
	// designate a directory literally named "~" under the working directory.
	// Resolve the home directory explicitly instead.
	String userHome = System.getProperty("user.home")

	File dir = new File(userHome, "xml/geo")
	String cwbBinPath = new File(userHome, "TXM/cwb/bin").getPath()

	def c = new compiler()
	c.setDebug()
	c.setCwbPath(cwbBinPath)
	c.run(dir, "geo")
}
733 | 321 | mdecorde | } |