root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / xmltxmpara / compiler.groovy @ 1688
History | View | Annotate | Download (11.5 kB)
1 | 321 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 321 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 321 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 321 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 321 | mdecorde | //
|
6 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 321 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 321 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 321 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 321 | mdecorde | // later version.
|
11 | 321 | mdecorde | //
|
12 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 321 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 321 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 321 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 321 | mdecorde | // details.
|
17 | 321 | mdecorde | //
|
18 | 321 | mdecorde | // You should have received a copy of the GNU General
|
19 | 321 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 321 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 321 | mdecorde | //
|
22 | 321 | mdecorde | //
|
23 | 321 | mdecorde | //
|
24 | 321 | mdecorde | // $LastChangedDate: 2011-06-03 12:37:57 +0200 (Fri, 03 Jun 2011) $
|
25 | 321 | mdecorde | // $LastChangedRevision: 1867 $
|
26 | 321 | mdecorde | // $LastChangedBy: mdecorde $
|
27 | 321 | mdecorde | //
|
28 | 321 | mdecorde | |
29 | 321 | mdecorde | |
30 | 986 | mdecorde | package org.txm.scripts.importer.xmltxmpara;
|
31 | 321 | mdecorde | |
32 | 321 | mdecorde | import java.util.ArrayList; |
33 | 321 | mdecorde | |
34 | 1000 | mdecorde | import org.txm.importer.cwb.BuildCwbEncodeArgs; |
35 | 1000 | mdecorde | import org.txm.importer.cwb.CwbEncode |
36 | 1000 | mdecorde | import org.txm.importer.cwb.CwbMakeAll |
37 | 986 | mdecorde | import org.txm.scripts.importer.*; |
38 | 321 | mdecorde | import org.txm.scripts.*; |
39 | 1000 | mdecorde | import org.txm.importer.scripts.xmltxm.BuildTTSrc; |
40 | 1000 | mdecorde | import org.txm.importer.scripts.xmltxm.*; |
41 | 321 | mdecorde | import org.txm.utils.treetagger.TreeTagger; |
42 | 321 | mdecorde | |
43 | 321 | mdecorde | import javax.xml.stream.*; |
44 | 321 | mdecorde | import java.net.URL; |
45 | 321 | mdecorde | import java.io.File; |
46 | 321 | mdecorde | import java.util.HashMap; |
47 | 321 | mdecorde | import java.util.List; |
48 | 321 | mdecorde | |
49 | 321 | mdecorde | // TODO: Auto-generated Javadoc
|
/**
 * Converts XML-TXM files into one CQP source file and indexes it with CWB.
 *
 * Typical use: create an instance with the no-argument constructor, optionally
 * call {@link #setLang(String)} and {@link #setDebug()}, then call
 * {@link #run(ArrayList, File, String, String, String[])}. For every input file
 * run() creates a second instance (URL constructor) that appends the content of
 * that file to the shared CQP file through {@link #transfomFileCqp(File)}.
 *
 * The names of the word properties and of the structures met while reading the
 * files are collected in static fields, so two imports must not run at the same
 * time in the same JVM.
 */
class compiler
{
    /** When true, the CWB tools are asked to run in debug mode. */
    private boolean debug = false;

    /** Input stream opened on the XML-TXM file being read. */
    private def inputData;

    /** StAX factory used to create the parser. */
    private def factory;

    /** StAX parser reading the XML-TXM file; null when the file could not be opened. */
    private XMLStreamReader parser;

    /** Not used by this class; kept for compatibility. */
    private def dir;

    /** Writer on the CQP file, (re)opened by createOutput(). */
    private Writer output;

    /** URL of the XML-TXM file being read. */
    private def url;

    /** Identifier of the text, written as the "id" attribute of the text structure. */
    String text = "";

    /** Written as the "base" attribute of the text structure. */
    String base = "";

    /** Written as the "project" attribute of the text structure. */
    String project = "";

    /** Not read by this class; kept for compatibility. */
    String[] textAttributes = null;

    /** Language written in the "lang" attribute of the txmcorpus structure. */
    private String lang = "fr";

    /** Word property types found in the files, in order of first appearance. Reset by run(). */
    private static ArrayList<String> anatypes = new ArrayList<String>();

    /** Lower-cased structure name -> lower-cased attribute names found in the files. Reset by run(). */
    private static HashMap<String, List<String>> sAttribs = new HashMap<String, List<String>>();

    /**
     * Creates a compiler that is not bound to a file; use it to call run().
     */
    public compiler(){}

    /**
     * Creates a compiler bound to one XML-TXM file.
     *
     * Opening errors are reported on the console and leave the parser unset;
     * transfomFileCqp() then returns false.
     *
     * @param url the location of the XML-TXM file
     * @param text the text identifier
     * @param base the value of the "base" attribute of the text
     * @param project the value of the "project" attribute of the text
     */
    public compiler(URL url, String text, String base, String project)
    {
        this.text = text;
        this.base = base;
        this.project = project;
        this.url = url;
        try {
            inputData = url.openStream();
            factory = XMLInputFactory.newInstance();
            parser = factory.createXMLStreamReader(inputData);
        } catch (XMLStreamException ex) {
            System.out.println(ex);
            // the stream was opened but cannot be parsed: do not leak it
            closeInput();
            parser = null;
        } catch (IOException ex) {
            System.err.println("IOException while parsing " + url + ": " + ex);
        }
    }

    /**
     * Sets the language of the corpus.
     *
     * @param lang the language code
     * @return the value assigned
     */
    public setLang(String lang)
    {
        this.lang = lang;
    }

    /**
     * Opens the writer on the CQP file, encoded in UTF-8.
     *
     * The file is opened in append mode when it already exists, because the
     * corpus header, each text and the corpus footer are written by separate
     * open/close cycles.
     *
     * @param f the CQP file
     * @return true, if the writer was opened
     */
    private boolean createOutput(File f)
    {
        try {
            output = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f, f.exists()), "UTF-8"));
            return true;
        } catch (Exception e) {
            System.err.println(e);
            return false;
        }
    }

    /**
     * Moves the parser just after the end tag of the teiHeader element, or to
     * the end of the document when there is no such element.
     */
    private void GoToText()
    {
        while (parser.hasNext()) {
            if (parser.next() == XMLStreamConstants.END_ELEMENT
                    && parser.getLocalName().equals("teiHeader"))
                return;
        }
    }

    /**
     * Closes a resource and ignores any failure; used for best-effort cleanup only.
     *
     * @param resource any object with a close() method, may be null
     */
    private static void closeQuietly(def resource)
    {
        try {
            if (resource != null)
                resource.close();
        } catch (Exception ignored) {
            // nothing useful can be done if cleanup itself fails
        }
    }

    /**
     * Closes the parser and the input stream of the XML-TXM file.
     */
    private void closeInput()
    {
        closeQuietly(parser);
        closeQuietly(inputData);
    }

    /**
     * Records a word property type the first time it is met.
     *
     * @param type the property type
     */
    private static void registerAnaType(String type)
    {
        if (!anatypes.contains(type))
            anatypes << type;
    }

    /**
     * Writes the start tag of the structure the parser is positioned on and
     * records the structure and its attributes in sAttribs.
     *
     * Names are written lower-cased. The "id" attribute of a "text" element is
     * replaced by the text identifier of this compiler, and "base" and
     * "project" are added.
     */
    private void writeStructureStart()
    {
        String localname = parser.getLocalName();
        String tag = localname.toLowerCase();
        boolean isText = "text".equals(localname);

        // look the structure up with the same (lower-cased) key it is stored
        // under, so attributes already recorded are not lost
        List<String> known = sAttribs.get(tag);
        if (known == null) {
            known = [];
            sAttribs.put(tag, known);
        }

        output.write("<" + tag);
        for (int i = 0; i < parser.getAttributeCount(); i++) {
            String attrname = parser.getAttributeLocalName(i).toLowerCase();
            String attrvalue = parser.getAttributeValue(i);
            if (!(isText && attrname.equals("id")))
                output.write(" " + attrname + "=\"" + attrvalue.replace("\"", "'") + "\"");

            if (!known.contains(attrname))
                known.add(attrname);
        }

        if (isText) { // add some infos
            output.write(" id=\"" + text + "\" base=\"" + base + "\" project=\"" + project + "\"");
        }
        output.write(">\n");
    }

    /**
     * Appends the content of the XML-TXM file to the CQP file.
     *
     * Reading starts after the teiHeader element and stops at the end tag of
     * the "text" element. Each "w" element produces one line
     * "form TAB id TAB property...". Other elements met outside a word are
     * written as lower-cased structure tags.
     *
     * @param cqpfile the CQP file to append to
     * @return true, if successful
     */
    public boolean transfomFileCqp(File cqpfile)
    {
        if (parser == null) {
            println("Error: no XML parser available for " + url);
            closeInput();
            return false;
        }
        if (!createOutput(cqpfile)) {
            closeInput();
            return false;
        }

        String vAna = "";   // properties of the current word, each one prefixed with a tab
        String vForm = "";  // form of the current word
        String wordid = "";
        boolean flagForm = false; // inside the default form of a word
        boolean flagAna = false;  // inside a property (ana, or typed form) of a word
        boolean inW = false;
        int wcounter = 1;
        String localname;
        String type;

        try {
            GoToText();

            boolean stop = false;
            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT && !stop; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        localname = parser.getLocalName();
                        switch (localname) {
                            case "w":
                                inW = true;
                                wordid = parser.getAttributeValue(null, "id");
                                if (wordid == null)
                                    wordid = "w_" + text + "_" + (wcounter++);
                                vAna = "";
                                break;

                            case "form":
                                type = parser.getAttributeValue(null, "type");
                                if (type == null || type.equals("default")) {
                                    flagForm = true;
                                    vForm = "";
                                } else {
                                    // a typed form is stored as a word property;
                                    // it must not erase the default form
                                    flagAna = true;
                                    vAna += "\t";
                                    registerAnaType(type);
                                }
                                break;

                            case "ana":
                                flagAna = true;
                                vAna += "\t";
                                type = parser.getAttributeValue(null, "type");
                                if (type != null) {
                                    if (type.startsWith("#"))
                                        type = type.substring(1);
                                    registerAnaType(type);
                                }
                                break;

                            default:
                                if (!inW)
                                    writeStructureStart();
                        }
                        break;

                    case XMLStreamConstants.END_ELEMENT:
                        localname = parser.getLocalName();
                        switch (localname) {
                            case "w":
                                // the parser returns decoded text: re-escape the
                                // characters that are markup in the CQP file
                                output.write(vForm.replace("&", "&amp;").replace("<", "&lt;") + "\t" + wordid + vAna + "\n");
                                vAna = "";
                                vForm = "";
                                inW = false;
                                break;

                            case "form":
                                flagForm = false;
                                flagAna = false;
                                break;

                            case "ana":
                                flagAna = false;
                                break;

                            default:
                                if (!inW)
                                    output.write("</" + localname.toLowerCase() + ">\n");
                                if ("text".equals(localname))
                                    stop = true;
                        }
                        break;

                    case XMLStreamConstants.CHARACTERS:
                        if (inW) {
                            // each chunk of text goes to exactly one target
                            if (flagAna)
                                vAna += parser.getText().trim();
                            else if (flagForm)
                                vForm += parser.getText().trim();
                        }
                        break;
                }
            }

            // closed here so that a failure while flushing makes the method fail
            output.close();
        } catch (Exception ex) {
            System.out.println(ex);
            return false;
        } finally {
            closeQuietly(output); // no-op when already closed
            closeInput();
        }
        return true;
    }

    /**
     * Appends a piece of text to the CQP file in its own open/close cycle.
     *
     * @param cqpfile the CQP file
     * @param content the text to append
     * @return true, if the text was written
     */
    private boolean appendToCqp(File cqpfile, String content)
    {
        if (!createOutput(cqpfile))
            return false;
        try {
            output.write(content);
            output.close();
            return true;
        } catch (IOException e) {
            System.err.println(e);
            return false;
        } finally {
            closeQuietly(output);
        }
    }

    /**
     * Builds the CQP file from the XML-TXM files, then runs cwb-encode and
     * cwb-makeall on it.
     *
     * The CQP file is binDir/cqp/&lt;corpusname lower-cased&gt;.cqp. A file of
     * that name left by a previous run is removed first. The CWB data and
     * registry are written under binDir/data and binDir/registry.
     *
     * @param files the XML-TXM files to process; missing files are reported and skipped
     * @param binDir the binary directory of the corpus, must exist
     * @param corpusname the corpus name
     * @param basename the value of the "base" attribute of every text
     * @param textAttributes not used
     * @return true, if successful
     */
    public boolean run(ArrayList<File> files, File binDir, String corpusname, String basename, String[] textAttributes)
    {
        anatypes = new ArrayList<String>();// init only 1 time
        sAttribs = new HashMap<String, List<String>>();// init only 1 time
        String rootDir = binDir.getAbsolutePath();

        if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
            println ("Error: CWB executables not well set.")
            return false;
        }
        if (!binDir.exists()) {
            println ("binary directory does not exists: "+binDir)
            return false;
        }
        if (files == null || files.size() == 0) {
            println "Error: no file to process"
            return false;
        }

        File cqpdir = new File(binDir, "cqp");
        cqpdir.mkdirs();
        File cqpfile = new File(cqpdir, corpusname.toLowerCase() + ".cqp");
        // every write below appends: start from an empty file
        if (cqpfile.exists() && !cqpfile.delete()) {
            println "Error: could not remove the previous CQP file: " + cqpfile
            return false;
        }

        //0 set Lang
        if (!appendToCqp(cqpfile, "<txmcorpus lang=\"" + lang + "\">\n")) {
            println "Error: could not write the CQP file: " + cqpfile
            return false;
        }

        //1- Transform into CQP file
        for (File f : files) {
            if (!f.exists()) {
                println("The file "+f+ " does not exists")
                continue;
            }
            // text identifier = file name without its extension
            String filename = f.getName();
            int dot = filename.lastIndexOf(".");
            String txtname = (dot > 0) ? filename.substring(0, dot) : filename;

            compiler builder = new compiler(f.toURI().toURL(), txtname, basename, "default");
            builder.setLang(lang);
            if (!builder.transfomFileCqp(cqpfile))
                return false;
        }

        //end corpus
        if (!appendToCqp(cqpfile, "</txmcorpus>\n")) {
            println "Error: could not write the CQP file: " + cqpfile
            return false;
        }

        //2- Import into CWB
        def outDirTxm = rootDir;

        CwbEncode cwbEn = new CwbEncode();
        cwbEn.setDebug(debug);
        CwbMakeAll cwbMa = new CwbMakeAll();
        cwbMa.setDebug(debug);

        List<String> pargs = ["id"];
        pargs.addAll(anatypes);

        List<String> sargs = [];
        // the text structure always carries the attributes added by this compiler
        List<String> textAttrs = sAttribs.get("text");
        if (textAttrs != null) {
            for (String added : ["id", "base", "project"])
                if (!textAttrs.contains(added))
                    textAttrs.add(added);
        } else {
            sargs.add("text:0+id+base+project")
        }

        List<String> corpusAttrs = sAttribs.get("txmcorpus");
        if (corpusAttrs != null) {
            if (!corpusAttrs.contains("lang"))
                corpusAttrs.add("lang");
        } else {
            sargs.add("txmcorpus:0+lang")
        }

        // one declaration per structure: "name" or "name:+attr1+attr2..."
        for (String tag : sAttribs.keySet()) {
            List<String> attrs = sAttribs.get(tag);
            String sAttr = tag;
            if (attrs.size() > 0)
                sAttr += ":";
            for (String attr : attrs)
                sAttr += "+" + attr;
            sargs.add(sAttr)
        }

        String[] sAttributes = sargs;
        String[] pAttributes = pargs;
        println "Corpus structural attributes: "+sAttributes;
        println "Corpus lexical attributes: "+pAttributes;
        try {
            String regPath = outDirTxm + "/registry/"+corpusname.toLowerCase();
            cwbEn.run(
                    outDirTxm + "/data/"+corpusname+"/",
                    cqpfile.getAbsolutePath(),
                    regPath, pAttributes, sAttributes);
            if (!new File(regPath).exists()) {
                println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
                return false;
            }
            cwbMa.run(corpusname.toUpperCase(), outDirTxm + "/registry");
        } catch (Exception ex) {
            System.out.println(ex);
            return false;
        }

        System.out.println("Done.")
        return true;
    }

    /**
     * Turns the debug mode on.
     */
    public void setDebug()
    {
        this.debug = true;
    }

    /**
     * Manual test entry point.
     *
     * Processes the regular files found directly in a directory and uses that
     * same directory as binary directory.
     *
     * @param args optional: [0] the directory (default: xml/geo in the user
     *             home), [1] the corpus name (default: "geo")
     */
    public static void main(String[] args)
    {
        File dir;
        if (args != null && args.length > 0)
            dir = new File(args[0]);
        else
            dir = new File(System.getProperty("user.home"), "xml/geo"); // "~" is not expanded by java.io.File
        String name = (args != null && args.length > 1) ? args[1] : "geo";

        ArrayList<File> files = new ArrayList<File>();
        File[] children = dir.listFiles();
        if (children != null) {
            for (File child : children)
                if (child.isFile())
                    files.add(child);
        }
        Collections.sort(files); // listFiles() gives no ordering guarantee

        def c = new compiler();
        c.setDebug();
        c.run(files, dir, name, name, null);
    }
}