root / tmp / org.txm.groovy.core / src / groovy / org / txm / importer / bfm / compiler.groovy @ 966
History | View | Annotate | Download (26.7 kB)
1 | 321 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 321 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 321 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 321 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 321 | mdecorde | //
|
6 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 321 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 321 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 321 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 321 | mdecorde | // later version.
|
11 | 321 | mdecorde | //
|
12 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 321 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 321 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 321 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 321 | mdecorde | // details.
|
17 | 321 | mdecorde | //
|
18 | 321 | mdecorde | // You should have received a copy of the GNU General
|
19 | 321 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 321 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 321 | mdecorde | |
22 | 321 | mdecorde | //
|
23 | 321 | mdecorde | // This file is part of the TXM platform.
|
24 | 321 | mdecorde | //
|
25 | 321 | mdecorde | // The TXM platform is free software: you can redistribute it and/or modify
|
26 | 321 | mdecorde | // it under the terms of the GNU General Public License as published by
|
27 | 321 | mdecorde | // the Free Software Foundation, either version 3 of the License, or
|
28 | 321 | mdecorde | // (at your option) any later version.
|
29 | 321 | mdecorde | //
|
30 | 321 | mdecorde | // The TXM platform is distributed in the hope that it will be useful,
|
31 | 321 | mdecorde | // but WITHOUT ANY WARRANTY; without even the implied warranty of
|
32 | 321 | mdecorde | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
33 | 321 | mdecorde | // GNU General Public License for more details.
|
34 | 321 | mdecorde | //
|
35 | 321 | mdecorde | // You should have received a copy of the GNU General Public License
|
36 | 321 | mdecorde | // along with the TXM platform. If not, see <http://www.gnu.org/licenses/>.
|
37 | 321 | mdecorde | //
|
38 | 321 | mdecorde | //
|
39 | 321 | mdecorde | //
|
40 | 479 | mdecorde | // $LastChangedDate: 2017-01-24 18:11:42 +0100 (mar. 24 janv. 2017) $
|
41 | 321 | mdecorde | // $LastChangedRevision: 3400 $
|
42 | 321 | mdecorde | // $LastChangedBy: mdecorde $
|
43 | 321 | mdecorde | //
|
44 | 321 | mdecorde | package org.txm.importer.bfm
|
45 | 321 | mdecorde | |
46 | 321 | mdecorde | import org.txm.Toolbox; |
47 | 321 | mdecorde | import org.txm.importer.cwb.* |
48 | 321 | mdecorde | import org.txm.importer.*; |
49 | 321 | mdecorde | import org.txm.scripts.*; |
50 | 927 | mdecorde | import org.txm.importer.xmltxm.BuildTTSrc; |
51 | 927 | mdecorde | import org.txm.importer.xmltxm.*; |
52 | 321 | mdecorde | import org.txm.utils.treetagger.TreeTagger; |
53 | 321 | mdecorde | |
54 | 321 | mdecorde | import javax.xml.stream.*; |
55 | 321 | mdecorde | import java.net.URL; |
56 | 321 | mdecorde | import java.io.File; |
57 | 321 | mdecorde | import java.util.Comparator; |
58 | 321 | mdecorde | import java.util.HashMap; |
59 | 321 | mdecorde | import java.util.List; |
60 | 321 | mdecorde | |
61 | 321 | mdecorde | // TODO: Auto-generated Javadoc
|
62 | 321 | mdecorde | /**
|
63 | 804 | mdecorde | * Produce CQP files from the TEI-TXM files. <br/>
|
64 | 321 | mdecorde | * - Read texts metadata with XPath queries <br/>
|
65 | 321 | mdecorde | * - Add the following word properties : sic, abbr, orig, lb and pb <br/>
|
66 | 321 | mdecorde | * - Keep <front>, <body> and <back> for each text <br/>
|
67 | 321 | mdecorde | * - Text enclosed in <q> is tokenized <br/>
|
68 | 321 | mdecorde | *
|
69 | 321 | mdecorde | * @author mdecorde
|
70 | 321 | mdecorde | *
|
71 | 321 | mdecorde | */
|
72 | 321 | mdecorde | class compiler { |
73 | 321 | mdecorde | /** The debug. */
|
74 | 321 | mdecorde | private boolean debug= false; |
75 | 321 | mdecorde | |
76 | 321 | mdecorde | /** The annotate_status. */
|
77 | 321 | mdecorde | private boolean annotate_status=true; |
78 | 321 | mdecorde | |
79 | 321 | mdecorde | /** The input data. */
|
80 | 321 | mdecorde | private def inputData; |
81 | 321 | mdecorde | |
82 | 321 | mdecorde | /** The factory. */
|
83 | 321 | mdecorde | private def factory; |
84 | 321 | mdecorde | |
85 | 321 | mdecorde | /** The parser. */
|
86 | 321 | mdecorde | private XMLStreamReader parser;
|
87 | 321 | mdecorde | |
88 | 321 | mdecorde | /** The dir. */
|
89 | 321 | mdecorde | private def dir; |
90 | 321 | mdecorde | |
91 | 321 | mdecorde | /** The output. */
|
92 | 321 | mdecorde | private def output; |
93 | 321 | mdecorde | |
94 | 321 | mdecorde | /** The url. */
|
95 | 321 | mdecorde | private def url; |
96 | 321 | mdecorde | |
97 | 321 | mdecorde | /** True until the first word line has been written; while true, each ana type encountered is appended to anaTypes. */
|
98 | 321 | mdecorde | static boolean firstWord = true; |
99 | 321 | mdecorde | static private def anaTypes = []; |
100 | 321 | mdecorde | private HashMap<String,String> anahash = new HashMap<String,String>() ; |
101 | 321 | mdecorde | |
102 | 321 | mdecorde | private static SAttributesListener sattrsListener; |
103 | 321 | mdecorde | private static HashMap<String,ArrayList<String>> structs; |
104 | 321 | mdecorde | private static HashMap<String, Integer> structsProf; |
105 | 321 | mdecorde | |
106 | 321 | mdecorde | /** The text. */
|
107 | 321 | mdecorde | private String text=""; |
108 | 321 | mdecorde | |
109 | 321 | mdecorde | /** The base. */
|
110 | 321 | mdecorde | private String base=""; |
111 | 321 | mdecorde | |
112 | 321 | mdecorde | /** The project. */
|
113 | 321 | mdecorde | private String project=""; |
114 | 321 | mdecorde | |
115 | 321 | mdecorde | /** The lang. */
|
116 | 321 | mdecorde | private String lang ="fr"; |
117 | 321 | mdecorde | |
118 | 321 | mdecorde | /**
|
119 | 321 | mdecorde | * contains the metadata xpath organize per name
|
120 | 321 | mdecorde | */
|
121 | 321 | mdecorde | Properties metadataXPath;
|
122 | 321 | mdecorde | |
/**
 * Creates an empty compiler.
 *
 * The only effect is to reset the class-wide (static) bookkeeping: the
 * list of known ana types is emptied and the "first word" marker is
 * raised again. No input is opened by this constructor.
 */
public compiler() {
	anaTypes = [];
	firstWord = true;
}
131 | 321 | mdecorde | |
/**
 * Initializes the compiler for one input file.
 *
 * Stores the identification values, opens the stream behind {@code url},
 * creates a StAX reader on it and hands that reader to the class-wide
 * structure-attributes listener: the listener is created on first use and
 * its {@code start(parser)} method is called on later uses.
 *
 * Stream and XML errors are not rethrown; they are reported on standard
 * output together with the URL and the cause. If the reader could not be
 * created, the stream that was already opened is closed again so it does
 * not leak.
 *
 * @param url the file to process
 * @param text the text's name
 * @param base the base's name
 * @param project the project's name
 * @param metadataXPath the metadata XPath queries, organized per name
 */
public compiler(URL url, String text, String base, String project, Properties metadataXPath)
{
	this.metadataXPath = metadataXPath;
	this.text = text;
	this.base = base;
	this.project = project;
	this.url = url;
	try {
		inputData = url.openStream();

		factory = XMLInputFactory.newInstance();
		parser = factory.createXMLStreamReader(inputData);
		if (sattrsListener == null) {
			sattrsListener = new SAttributesListener(parser);
		} else {
			sattrsListener.start(parser);
		}
	} catch (XMLStreamException ex) {
		System.out.println("XMLStreamException while opening " + url + ": " + ex);
		// Only release the stream when no reader was built on top of it;
		// an existing reader may still be used by later calls.
		if (parser == null && inputData != null) {
			try {
				inputData.close();
			} catch (IOException ignored) {
				// best effort: the original failure has already been reported
			}
		}
	} catch (IOException ex) {
		System.out.println("IOException while parsing " + url + ": " + ex);
	}
}
162 | 321 | mdecorde | |
/**
 * The ordered TXM files.
 * NOTE(review): nothing in the visible part of this file assigns this
 * field, so it may still be null when read - confirm against callers.
 */
ArrayList<File> orderedFiles;

/**
 * Gives access to the ordered TXM files.
 *
 * @return the current value of {@code orderedFiles}, returned as is (no copy)
 */
public ArrayList<File> getOrderedTxmFiles() {
	return this.orderedFiles;
}
167 | 321 | mdecorde | |
/**
 * Sets the language written in the "lang" column of the word lines.
 *
 * Note that the CQP transformation also overwrites this field itself:
 * a {@code foreign} element replaces it with its own "lang" attribute,
 * and an empty value falls back to "fr" when a word is written.
 *
 * @param lang the new language code
 */
public void setLang(String lang)
{
	this.@lang = lang;
}
177 | 321 | mdecorde | |
/**
 * Records whether the annotation step has been done.
 *
 * The value is only stored in {@code annotate_status}.
 * NOTE(review): no visible code reads that field - confirm where it is used.
 *
 * @param done the new annotation status
 */
public void setAnnotationDone(boolean done)
{
	this.@annotate_status = done;
}
187 | 321 | mdecorde | |
/**
 * Opens the UTF-8 writer used to emit the CQP content.
 *
 * When the target file already exists the writer appends to it;
 * otherwise the file is created. Any failure is reported on standard
 * output (localized message only) and signalled through the return value.
 *
 * @param f the file to write to
 * @return true if the writer was opened, false if an exception occurred
 */
private boolean createOutput(File f) {
	boolean opened;
	try {
		// Evaluated inside the try block so that a failure here is
		// reported the same way as a failure to open the file.
		boolean appendToExisting = f.exists();
		FileOutputStream rawStream = new FileOutputStream(f, appendToExisting);
		output = new OutputStreamWriter(rawStream, "UTF-8");
		opened = true;
	} catch (Exception e) {
		System.out.println(e.getLocalizedMessage());
		opened = false;
	}
	return opened;
}
204 | 321 | mdecorde | |
/**
 * Skips the header of the document.
 *
 * Consumes parser events until the end tag of {@code teiHeader} has been
 * read, or until the end of the document if no such tag is found.
 */
private void GoToText()
{
	int event = parser.next();
	while (event != XMLStreamConstants.END_DOCUMENT) {
		boolean closingTag = (event == XMLStreamConstants.END_ELEMENT);
		if (closingTag && parser.getLocalName().equals("teiHeader")) {
			return;
		}
		event = parser.next();
	}
}
216 | 321 | mdecorde | |
/**
 * Computes the next number of a structure from the current element.
 *
 * If the element the parser is positioned on has an attribute whose
 * local name is "n" and whose value parses as an integer, that integer
 * is returned. In every other case (no such attribute, or a value that
 * is not an integer) the previous counter plus one is returned.
 *
 * @param parser the parser, positioned on the element to inspect
 * @param value the current counter value
 * @return the explicit "n" number when usable, otherwise {@code value + 1}
 */
private def increment(XMLStreamReader parser, int value)
{
	// Only the first attribute named "n" is taken into account.
	String explicitNumber = null;
	final int attributeCount = parser.getAttributeCount();
	for (int idx = 0; idx < attributeCount; idx++) {
		if ("n".equals(parser.getAttributeLocalName(idx))) {
			explicitNumber = parser.getAttributeValue(idx);
			break;
		}
	}

	if (explicitNumber == null) {
		return value + 1;
	}

	try {
		return Integer.parseInt(explicitNumber);
	} catch (Exception e) {
		// Not a plain integer: fall back to simple counting.
		return value + 1;
	}
}
245 | 321 | mdecorde | |
246 | 321 | mdecorde | /**
|
247 | 803 | mdecorde | * Transfom file cqp.
|
248 | 321 | mdecorde | *
|
249 | 321 | mdecorde | * @param dirPathName the dir path name
|
250 | 321 | mdecorde | * @param fileName the file name
|
251 | 321 | mdecorde | * @return true, if successful
|
252 | 321 | mdecorde | */
|
253 | 804 | mdecorde | private boolean transfomFileCqp(File cqpFile) |
254 | 321 | mdecorde | { |
255 | 321 | mdecorde | try {
|
256 | 803 | mdecorde | if (!createOutput(cqpFile)) return false; |
257 | 321 | mdecorde | |
258 | 321 | mdecorde | String headvalue = "" |
259 | 321 | mdecorde | String vAna = ""; |
260 | 321 | mdecorde | String vForm = ""; |
261 | 321 | mdecorde | String wordid = ""; |
262 | 321 | mdecorde | String vHead = ""; |
263 | 321 | mdecorde | |
264 | 321 | mdecorde | Integer p_id = 0; |
265 | 321 | mdecorde | Integer s_id = 0; |
266 | 321 | mdecorde | Integer q_id = 0; |
267 | 321 | mdecorde | int sp_id = 0; |
268 | 321 | mdecorde | Integer body_id = 0; |
269 | 321 | mdecorde | Integer front_id = 0; |
270 | 321 | mdecorde | Integer back_id = 0; |
271 | 321 | mdecorde | Integer lb_id = 0; |
272 | 321 | mdecorde | Integer pb_id = 0; |
273 | 321 | mdecorde | Integer ab_id = 0; |
274 | 321 | mdecorde | int foreign_id = 0; |
275 | 321 | mdecorde | int name_id = 0; |
276 | 321 | mdecorde | |
277 | 321 | mdecorde | boolean captureword = false; |
278 | 321 | mdecorde | |
279 | 321 | mdecorde | String vExpan = ""; |
280 | 321 | mdecorde | String vCorr = ""; |
281 | 321 | mdecorde | String vReg = ""; |
282 | 321 | mdecorde | String vOrig = ""; |
283 | 321 | mdecorde | String vSic = ""; |
284 | 321 | mdecorde | String vAbbr = ""; |
285 | 321 | mdecorde | String givenpos = ""; |
286 | 321 | mdecorde | String pb_n = ""; |
287 | 321 | mdecorde | String foreign_lang = ""; |
288 | 321 | mdecorde | String nameType = ""; |
289 | 321 | mdecorde | String anaType;
|
290 | 321 | mdecorde | //String abType = "";
|
291 | 321 | mdecorde | |
292 | 321 | mdecorde | boolean foundtei=false, foundtext=false; |
293 | 321 | mdecorde | |
294 | 321 | mdecorde | boolean flaglg = false; |
295 | 321 | mdecorde | int levelq = 0; |
296 | 321 | mdecorde | //boolean flagq = false;
|
297 | 321 | mdecorde | boolean flaghead = false; |
298 | 321 | mdecorde | //Added:
|
299 | 321 | mdecorde | boolean flagSp = false; |
300 | 321 | mdecorde | boolean flagAuthor = false; |
301 | 321 | mdecorde | boolean flagDate = false; |
302 | 321 | mdecorde | boolean flagWord = false; |
303 | 321 | mdecorde | boolean flagForm = false; |
304 | 321 | mdecorde | boolean flagAna = false; |
305 | 321 | mdecorde | |
306 | 321 | mdecorde | boolean flagchoice = false; |
307 | 321 | mdecorde | boolean flagcorr = false; |
308 | 321 | mdecorde | boolean flagsic = false; |
309 | 321 | mdecorde | boolean flagreg = false; |
310 | 321 | mdecorde | boolean flagexpan = false; |
311 | 321 | mdecorde | boolean flagorig = false; |
312 | 321 | mdecorde | boolean flagabbr = false; |
313 | 321 | mdecorde | boolean flagfw = false; |
314 | 321 | mdecorde | //boolean flagSupplied = false;
|
315 | 321 | mdecorde | int levelSupplied = 0; |
316 | 321 | mdecorde | //boolean flagSurplus = false;
|
317 | 321 | mdecorde | boolean flagForeign = false; |
318 | 321 | mdecorde | //boolean flagName = false;
|
319 | 321 | mdecorde | |
320 | 321 | mdecorde | this.GoToText();
|
321 | 321 | mdecorde | int missingId= 0 |
322 | 321 | mdecorde | boolean USEVERSE = false; // switch default reference to verse references |
323 | 321 | mdecorde | String titreId; // the title to use in the reference |
324 | 321 | mdecorde | |
325 | 321 | mdecorde | for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) { |
326 | 321 | mdecorde | switch (event) {
|
327 | 321 | mdecorde | case XMLStreamConstants.START_ELEMENT:
|
328 | 321 | mdecorde | |
329 | 321 | mdecorde | String localname = parser.getLocalName();
|
330 | 321 | mdecorde | if (foundtext) sattrsListener.startElement(localname);
|
331 | 321 | mdecorde | |
332 | 321 | mdecorde | switch (localname) {
|
333 | 321 | mdecorde | case "TEI": |
334 | 321 | mdecorde | foundtei = true;
|
335 | 321 | mdecorde | break;
|
336 | 321 | mdecorde | case "text": |
337 | 321 | mdecorde | foundtext = true;
|
338 | 321 | mdecorde | sattrsListener.startElement(localname); |
339 | 321 | mdecorde | output.write("<text id=\""+text+"\"") |
340 | 321 | mdecorde | |
341 | 321 | mdecorde | for (int i = 0; i < parser.getAttributeCount() ; i++) { |
342 | 321 | mdecorde | String name = parser.getAttributeLocalName(i);
|
343 | 321 | mdecorde | if ("id" == name || "base" == name || "project" == name) continue; |
344 | 321 | mdecorde | output.write(" "+name+"=\""+parser.getAttributeValue(i)+"\""); |
345 | 321 | mdecorde | |
346 | 321 | mdecorde | if (name == "forme") { |
347 | 321 | mdecorde | USEVERSE = (parser.getAttributeValue(i).contains("vers"))
|
348 | 321 | mdecorde | } else if (name == "sigle") { |
349 | 321 | mdecorde | titreId = parser.getAttributeValue(i) |
350 | 321 | mdecorde | } |
351 | 321 | mdecorde | } |
352 | 321 | mdecorde | |
353 | 321 | mdecorde | output.write(" base=\""+base+"\" project=\""+project+"\">\n"); |
354 | 321 | mdecorde | captureword=true;
|
355 | 321 | mdecorde | break;
|
356 | 321 | mdecorde | |
357 | 321 | mdecorde | case "div": |
358 | 321 | mdecorde | //output.write("<div type=\""+parser.getAttributeValue(null,"type")+"\">\n");
|
359 | 321 | mdecorde | String divType = "NA"; |
360 | 321 | mdecorde | String divSubtype = "NA"; |
361 | 321 | mdecorde | String divN = "NA"; |
362 | 321 | mdecorde | String divId ="NA"; |
363 | 321 | mdecorde | for(int i = 0 ; i < parser.getAttributeCount(); i++) { |
364 | 321 | mdecorde | if(parser.getAttributeLocalName(i) == "type") { |
365 | 321 | mdecorde | divType = parser.getAttributeValue(i); |
366 | 321 | mdecorde | } else if(parser.getAttributeLocalName(i) == "subtype") { |
367 | 321 | mdecorde | divSubtype = parser.getAttributeValue(i); |
368 | 321 | mdecorde | } else if(parser.getAttributeLocalName(i) == "n") { |
369 | 321 | mdecorde | divN = parser.getAttributeValue(i); |
370 | 321 | mdecorde | } else if(parser.getAttributeLocalName(i) == "id") { |
371 | 321 | mdecorde | divId = parser.getAttributeValue(i); |
372 | 321 | mdecorde | break;
|
373 | 321 | mdecorde | } |
374 | 321 | mdecorde | } |
375 | 321 | mdecorde | output.write("<div type=\""+divType+"\" subtype=\""+divSubtype+"\" n=\""+divN+"\" id=\""+divId+"\">\n"); |
376 | 321 | mdecorde | break;
|
377 | 321 | mdecorde | case "p": |
378 | 321 | mdecorde | p_id = increment(parser, p_id); |
379 | 321 | mdecorde | output.write("<p n=\""+p_id+"\">\n"); |
380 | 321 | mdecorde | break;
|
381 | 321 | mdecorde | case "ab": |
382 | 321 | mdecorde | ab_id = increment(parser, ab_id) |
383 | 321 | mdecorde | output.write("<ab n=\""+(ab_id)+"\" type=\""+parser.getAttributeValue(null,"type")+"\" subtype=\""+parser.getAttributeValue(null,"subtype")+"\" rend=\""+parser.getAttributeValue(null,"rend")+"\">\n"); |
384 | 321 | mdecorde | break;
|
385 | 321 | mdecorde | case "q": |
386 | 321 | mdecorde | q_id = increment(parser, q_id) |
387 | 321 | mdecorde | output.write("<q n=\""+(q_id)+"\">\n"); |
388 | 321 | mdecorde | //flagq=true;
|
389 | 321 | mdecorde | levelq = levelq + 1;
|
390 | 321 | mdecorde | break;
|
391 | 321 | mdecorde | case "sp": |
392 | 321 | mdecorde | sp_id = increment(parser, sp_id) |
393 | 321 | mdecorde | output.write("<sp n=\""+(sp_id)+"\">\n"); |
394 | 321 | mdecorde | flagSp = true;
|
395 | 321 | mdecorde | break;
|
396 | 321 | mdecorde | case "front": |
397 | 321 | mdecorde | front_id = increment(parser, front_id) |
398 | 321 | mdecorde | output.write("<front n=\""+front_id+"\">\n"); |
399 | 321 | mdecorde | break;
|
400 | 321 | mdecorde | case "body": |
401 | 321 | mdecorde | body_id= increment(parser, body_id) |
402 | 321 | mdecorde | output.write("<body n=\""+body_id+"\">\n"); |
403 | 321 | mdecorde | break;
|
404 | 321 | mdecorde | case "back": |
405 | 321 | mdecorde | back_id = increment(parser, back_id) |
406 | 321 | mdecorde | output.write("<back n=\""+back_id+"\">\n"); |
407 | 321 | mdecorde | break;
|
408 | 321 | mdecorde | case "lb": |
409 | 321 | mdecorde | lb_id = increment(parser, lb_id) |
410 | 321 | mdecorde | break;
|
411 | 321 | mdecorde | case "pb": |
412 | 321 | mdecorde | pb_id = increment(parser, pb_id) |
413 | 321 | mdecorde | for (int i = 0 ; i < parser.getAttributeCount(); i++) { |
414 | 321 | mdecorde | if (parser.getAttributeLocalName(i) == "n") { |
415 | 321 | mdecorde | pb_n = parser.getAttributeValue(i); |
416 | 321 | mdecorde | } |
417 | 321 | mdecorde | } |
418 | 321 | mdecorde | break;
|
419 | 321 | mdecorde | case "s": |
420 | 321 | mdecorde | s_id = increment(parser, s_id) |
421 | 321 | mdecorde | output.write("<s n=\""+s_id+"\">\n"); |
422 | 321 | mdecorde | break;
|
423 | 321 | mdecorde | case "choice": |
424 | 321 | mdecorde | flagchoice = true;
|
425 | 321 | mdecorde | break;
|
426 | 321 | mdecorde | case "corr": |
427 | 321 | mdecorde | flagcorr = true;
|
428 | 321 | mdecorde | vCorr= "";
|
429 | 321 | mdecorde | break;
|
430 | 321 | mdecorde | case "reg": |
431 | 321 | mdecorde | flagreg = true;
|
432 | 321 | mdecorde | vReg= "";
|
433 | 321 | mdecorde | break;
|
434 | 321 | mdecorde | case "expan": |
435 | 321 | mdecorde | flagexpan = true;
|
436 | 321 | mdecorde | vExpan= "";
|
437 | 321 | mdecorde | break;
|
438 | 321 | mdecorde | case "orig": |
439 | 321 | mdecorde | flagreg = true;
|
440 | 321 | mdecorde | vOrig= "";
|
441 | 321 | mdecorde | break;
|
442 | 321 | mdecorde | case "sic": |
443 | 321 | mdecorde | flagsic = true;
|
444 | 321 | mdecorde | vSic= "";
|
445 | 321 | mdecorde | break;
|
446 | 321 | mdecorde | case "abbr": |
447 | 321 | mdecorde | flagreg = true;
|
448 | 321 | mdecorde | vAbbr= "";
|
449 | 321 | mdecorde | break;
|
450 | 321 | mdecorde | case "foreign": |
451 | 321 | mdecorde | flagForeign = true;
|
452 | 321 | mdecorde | for (int i = 0 ; i < parser.getAttributeCount(); i++) { |
453 | 321 | mdecorde | if (parser.getAttributeLocalName(i) == "lang") { |
454 | 321 | mdecorde | lang = parser.getAttributeValue(i); |
455 | 321 | mdecorde | break;
|
456 | 321 | mdecorde | } |
457 | 321 | mdecorde | } |
458 | 321 | mdecorde | |
459 | 321 | mdecorde | output.write("<foreign n=\""+(foreign_id++)+"\" lang=\""+lang+"\">\n"); |
460 | 321 | mdecorde | //vForeign = "";
|
461 | 321 | mdecorde | break;
|
462 | 321 | mdecorde | |
463 | 321 | mdecorde | case "name": |
464 | 321 | mdecorde | //flagName = true;
|
465 | 321 | mdecorde | for(int i = 0 ; i < parser.getAttributeCount(); i++) |
466 | 321 | mdecorde | if(parser.getAttributeLocalName(i) == "type") |
467 | 321 | mdecorde | { |
468 | 321 | mdecorde | nameType = parser.getAttributeValue(i); |
469 | 321 | mdecorde | break;
|
470 | 321 | mdecorde | } |
471 | 321 | mdecorde | |
472 | 321 | mdecorde | output.write("<name n=\""+(name_id++)+"\" type=\""+nameType+"\">\n"); |
473 | 321 | mdecorde | break;
|
474 | 321 | mdecorde | case "supplied": |
475 | 321 | mdecorde | //flagSupplied = true;
|
476 | 321 | mdecorde | levelSupplied = levelSupplied + 1;
|
477 | 321 | mdecorde | break;
|
478 | 321 | mdecorde | |
479 | 321 | mdecorde | case "surplus": |
480 | 321 | mdecorde | flagfw = true;
|
481 | 321 | mdecorde | break;
|
482 | 321 | mdecorde | |
483 | 321 | mdecorde | case "del": |
484 | 321 | mdecorde | flagfw = true;
|
485 | 321 | mdecorde | break;
|
486 | 321 | mdecorde | |
487 | 321 | mdecorde | case "w": |
488 | 321 | mdecorde | givenpos = "";
|
489 | 321 | mdecorde | wordid = "w_"+text+"_m"+missingId++ |
490 | 321 | mdecorde | for (int i = 0 ; i < parser.getAttributeCount(); i++) { |
491 | 321 | mdecorde | if (parser.getAttributeLocalName(i) == "id") { |
492 | 321 | mdecorde | wordid = parser.getAttributeValue(i); |
493 | 321 | mdecorde | } else if (parser.getAttributeLocalName(i) == "type") { |
494 | 321 | mdecorde | givenpos = parser.getAttributeValue(i); |
495 | 321 | mdecorde | } |
496 | 321 | mdecorde | } |
497 | 321 | mdecorde | if (wordid.startsWith("w")) { |
498 | 321 | mdecorde | if (!wordid.startsWith("w_")) |
499 | 321 | mdecorde | wordid = "w_"+wordid.substring(1) |
500 | 321 | mdecorde | } else {
|
501 | 321 | mdecorde | wordid = "w_"+wordid;
|
502 | 321 | mdecorde | } |
503 | 321 | mdecorde | |
504 | 321 | mdecorde | if (givenpos == null || givenpos == "") |
505 | 321 | mdecorde | givenpos = "NA";
|
506 | 321 | mdecorde | vForm = "";
|
507 | 321 | mdecorde | anahash.clear(); // remove previous word ana values
|
508 | 321 | mdecorde | flagWord = true;
|
509 | 321 | mdecorde | break;
|
510 | 321 | mdecorde | case "form": |
511 | 321 | mdecorde | flagForm = true;
|
512 | 321 | mdecorde | vForm = "";
|
513 | 321 | mdecorde | break;
|
514 | 321 | mdecorde | |
515 | 321 | mdecorde | case "ana": |
516 | 321 | mdecorde | flagAna = true;
|
517 | 321 | mdecorde | anaType = parser.getAttributeValue(null, "type") |
518 | 321 | mdecorde | anahash.put(anaType, "");
|
519 | 321 | mdecorde | if (firstWord) {
|
520 | 321 | mdecorde | anaTypes << anaType; |
521 | 321 | mdecorde | } |
522 | 321 | mdecorde | break;
|
523 | 321 | mdecorde | } |
524 | 321 | mdecorde | break;
|
525 | 321 | mdecorde | |
526 | 321 | mdecorde | case XMLStreamConstants.END_ELEMENT:
|
527 | 321 | mdecorde | String localname = parser.getLocalName();
|
528 | 321 | mdecorde | if (foundtext) sattrsListener.endElement(localname);
|
529 | 321 | mdecorde | |
530 | 321 | mdecorde | switch (localname) {
|
531 | 321 | mdecorde | case "div": |
532 | 321 | mdecorde | output.write("</div>\n");
|
533 | 321 | mdecorde | break;
|
534 | 321 | mdecorde | case "text": |
535 | 321 | mdecorde | output.write("</text>\n");
|
536 | 321 | mdecorde | captureword=false;
|
537 | 321 | mdecorde | break;
|
538 | 321 | mdecorde | case "p": |
539 | 321 | mdecorde | output.write("</p>\n");
|
540 | 321 | mdecorde | break;
|
541 | 321 | mdecorde | case "s": |
542 | 321 | mdecorde | output.write("</s>\n");
|
543 | 321 | mdecorde | break;
|
544 | 321 | mdecorde | case "ab": |
545 | 321 | mdecorde | output.write("</ab>\n");
|
546 | 321 | mdecorde | break;
|
547 | 321 | mdecorde | case "q": |
548 | 321 | mdecorde | output.write("</q>\n");
|
549 | 321 | mdecorde | //flagq= false;
|
550 | 321 | mdecorde | levelq = levelq - 1;
|
551 | 321 | mdecorde | break;
|
552 | 321 | mdecorde | case "sp": |
553 | 321 | mdecorde | output.write("</sp>\n");
|
554 | 321 | mdecorde | flagSp = false;
|
555 | 321 | mdecorde | break;
|
556 | 321 | mdecorde | case "front": |
557 | 321 | mdecorde | output.write("</front>\n");
|
558 | 321 | mdecorde | break;
|
559 | 321 | mdecorde | case "body": |
560 | 321 | mdecorde | output.write("</body>\n");
|
561 | 321 | mdecorde | break;
|
562 | 321 | mdecorde | case "back": |
563 | 321 | mdecorde | output.write("</back>\n");
|
564 | 321 | mdecorde | break;
|
565 | 321 | mdecorde | |
566 | 321 | mdecorde | // case "fw":
|
567 | 321 | mdecorde | // flagfw = false;
|
568 | 321 | mdecorde | // break;
|
569 | 321 | mdecorde | |
570 | 321 | mdecorde | case "choice": |
571 | 321 | mdecorde | if(vOrig == "") |
572 | 321 | mdecorde | vOrig="NA";
|
573 | 321 | mdecorde | if(vSic == "") |
574 | 321 | mdecorde | vSic="NA";
|
575 | 321 | mdecorde | if(vAbbr == "") |
576 | 321 | mdecorde | vAbbr="NA";
|
577 | 321 | mdecorde | |
578 | 321 | mdecorde | String ref;
|
579 | 321 | mdecorde | if(USEVERSE)
|
580 | 321 | mdecorde | ref = titreId+", p."+pb_n+", v."+lb_id; |
581 | 321 | mdecorde | else
|
582 | 321 | mdecorde | ref = titreId+", p."+pb_n;
|
583 | 321 | mdecorde | |
584 | 321 | mdecorde | if (flagfw) {
|
585 | 321 | mdecorde | // on est hors texte
|
586 | 321 | mdecorde | } else {
|
587 | 321 | mdecorde | String vFormToWrite = vForm;
|
588 | 321 | mdecorde | if (vCorr != "") { |
589 | 321 | mdecorde | vFormToWrite = vCorr; |
590 | 321 | mdecorde | } else if(vReg != "") { |
591 | 321 | mdecorde | vFormToWrite = vReg; |
592 | 321 | mdecorde | } else if(vExpan != "") { |
593 | 321 | mdecorde | vFormToWrite = vExpan |
594 | 321 | mdecorde | } |
595 | 321 | mdecorde | firstWord = false;
|
596 | 321 | mdecorde | output.write( vFormToWrite +"\t"+wordid+"\t"+levelq.toString().substring(0,1)+"\t"+flagSp.toString().substring(0,1)+"\t"+pb_n+"\t"+lb_id+"\t"+vOrig+ |
597 | 321 | mdecorde | "\t"+vSic+"\t"+vAbbr+"\t"+ref+"\t"+givenpos+"\t"+levelSupplied.toString().substring(0,1)+"\t"+lang+"\t"+nameType); |
598 | 321 | mdecorde | for(String type : anaTypes) { |
599 | 321 | mdecorde | output.write("\t"+anahash.get(type));
|
600 | 321 | mdecorde | } |
601 | 321 | mdecorde | output.write("\n")
|
602 | 321 | mdecorde | } |
603 | 321 | mdecorde | flagchoice = false;
|
604 | 321 | mdecorde | vCorr= "";
|
605 | 321 | mdecorde | vSic= "";
|
606 | 321 | mdecorde | break;
|
607 | 321 | mdecorde | case "corr": |
608 | 321 | mdecorde | flagcorr = false;
|
609 | 321 | mdecorde | |
610 | 321 | mdecorde | break;
|
611 | 321 | mdecorde | case "reg": |
612 | 321 | mdecorde | flagreg = false;
|
613 | 321 | mdecorde | vReg = "";
|
614 | 321 | mdecorde | break;
|
615 | 321 | mdecorde | case "expan": |
616 | 321 | mdecorde | flagexpan = false;
|
617 | 321 | mdecorde | vExpan= "";
|
618 | 321 | mdecorde | break;
|
619 | 321 | mdecorde | case "orig": |
620 | 321 | mdecorde | flagreg = false;
|
621 | 321 | mdecorde | vOrig= "";
|
622 | 321 | mdecorde | break;
|
623 | 321 | mdecorde | case "sic": |
624 | 321 | mdecorde | flagsic = false;
|
625 | 321 | mdecorde | |
626 | 321 | mdecorde | break;
|
627 | 321 | mdecorde | case "abbr": |
628 | 321 | mdecorde | flagreg = false;
|
629 | 321 | mdecorde | vAbbr= "";
|
630 | 321 | mdecorde | break;
|
631 | 321 | mdecorde | |
632 | 321 | mdecorde | case "foreign": |
633 | 321 | mdecorde | flagForeign = false;
|
634 | 321 | mdecorde | lang = "";
|
635 | 321 | mdecorde | output.write("</foreign>\n");
|
636 | 321 | mdecorde | break;
|
637 | 321 | mdecorde | |
638 | 321 | mdecorde | case "name": |
639 | 321 | mdecorde | //flagName = false;
|
640 | 321 | mdecorde | nameType = "";
|
641 | 321 | mdecorde | output.write("</name>\n");
|
642 | 321 | mdecorde | break;
|
643 | 321 | mdecorde | |
644 | 321 | mdecorde | case "supplied": |
645 | 321 | mdecorde | //flagSupplied = false;
|
646 | 321 | mdecorde | levelSupplied = levelSupplied - 1;
|
647 | 321 | mdecorde | break;
|
648 | 321 | mdecorde | |
649 | 321 | mdecorde | case "surplus": |
650 | 321 | mdecorde | flagfw = false;
|
651 | 321 | mdecorde | break;
|
652 | 321 | mdecorde | |
653 | 321 | mdecorde | case "del": |
654 | 321 | mdecorde | flagfw = false;
|
655 | 321 | mdecorde | break;
|
656 | 321 | mdecorde | |
657 | 321 | mdecorde | case "w": |
658 | 321 | mdecorde | if (captureword) {
|
659 | 321 | mdecorde | if (flagchoice) {
|
660 | 321 | mdecorde | |
661 | 321 | mdecorde | } else if(flagfw) { |
662 | 321 | mdecorde | |
663 | 321 | mdecorde | } else {
|
664 | 321 | mdecorde | if (vOrig == "") |
665 | 321 | mdecorde | vOrig="NA";
|
666 | 321 | mdecorde | if(vSic == "") |
667 | 321 | mdecorde | vSic="NA";
|
668 | 321 | mdecorde | if(vAbbr == "") |
669 | 321 | mdecorde | vAbbr="NA";
|
670 | 321 | mdecorde | if (nameType == "") |
671 | 321 | mdecorde | nameType = "NA";
|
672 | 321 | mdecorde | if(lang == "") |
673 | 321 | mdecorde | lang="fr"
|
674 | 321 | mdecorde | |
675 | 321 | mdecorde | String ref;
|
676 | 321 | mdecorde | if(USEVERSE)
|
677 | 321 | mdecorde | ref = titreId+", p."+pb_n+", v."+lb_id; |
678 | 321 | mdecorde | else
|
679 | 321 | mdecorde | ref = titreId+", p."+pb_n;
|
680 | 321 | mdecorde | |
681 | 321 | mdecorde | firstWord = false;
|
682 | 321 | mdecorde | output.write(vForm.replaceAll("&", "&").replaceAll("<", "<") +"\t"+wordid+"\t"+levelq.toString().substring(0,1)+"\t"+flagSp.toString().substring(0,1)+"\t"+pb_n+"\t"+lb_id+"\t"+vOrig+"\t"+vSic+"\t"+vAbbr+"\t"+ref+"\t"+givenpos+"\t"+levelSupplied.toString().substring(0,1)+"\t"+lang+"\t"+nameType); |
683 | 321 | mdecorde | for(String type : anaTypes) { |
684 | 321 | mdecorde | output.write("\t"+anahash.get(type));
|
685 | 321 | mdecorde | } |
686 | 321 | mdecorde | output.write("\n")
|
687 | 321 | mdecorde | } |
688 | 321 | mdecorde | |
689 | 321 | mdecorde | flagWord = false;
|
690 | 321 | mdecorde | } |
691 | 321 | mdecorde | break;
|
692 | 321 | mdecorde | |
693 | 321 | mdecorde | case "form": |
694 | 321 | mdecorde | flagForm = false;
|
695 | 321 | mdecorde | break;
|
696 | 321 | mdecorde | |
697 | 321 | mdecorde | case "ana": |
698 | 321 | mdecorde | flagAna = false;
|
699 | 321 | mdecorde | anahash.put(anaType, vAna); |
700 | 321 | mdecorde | vAna = "";
|
701 | 321 | mdecorde | break;
|
702 | 321 | mdecorde | } |
703 | 321 | mdecorde | break; // end elem |
704 | 321 | mdecorde | |
705 | 321 | mdecorde | case XMLStreamConstants.CHARACTERS:
|
706 | 321 | mdecorde | if (flagAna) {
|
707 | 321 | mdecorde | vAna += parser.getText().trim() |
708 | 321 | mdecorde | } |
709 | 321 | mdecorde | |
710 | 321 | mdecorde | if (flagForm) {
|
711 | 321 | mdecorde | vForm += parser.getText().trim(); |
712 | 321 | mdecorde | if (flagchoice) {
|
713 | 321 | mdecorde | if (flagsic) {
|
714 | 321 | mdecorde | vSic += parser.getText().trim(); |
715 | 321 | mdecorde | } |
716 | 321 | mdecorde | if (flagorig) {
|
717 | 321 | mdecorde | vOrig += parser.getText().trim(); |
718 | 321 | mdecorde | } |
719 | 321 | mdecorde | if (flagabbr) {
|
720 | 321 | mdecorde | vAbbr += parser.getText().trim(); |
721 | 321 | mdecorde | } |
722 | 321 | mdecorde | if (flagcorr) {
|
723 | 321 | mdecorde | vCorr += parser.getText().trim(); |
724 | 321 | mdecorde | } |
725 | 321 | mdecorde | } |
726 | 321 | mdecorde | } |
727 | 321 | mdecorde | } |
728 | 321 | mdecorde | } |
729 | 321 | mdecorde | //output.write("</txmcorpus>");
|
730 | 321 | mdecorde | output.close(); |
731 | 321 | mdecorde | parser.close(); |
732 | 321 | mdecorde | inputData.close(); |
733 | 321 | mdecorde | } catch (Exception ex) { |
734 | 321 | mdecorde | System.out.println("Exception while parsing " + inputData); |
735 | 321 | mdecorde | ex.printStackTrace(); |
736 | 321 | mdecorde | return false; |
737 | 321 | mdecorde | } |
738 | 321 | mdecorde | |
739 | 321 | mdecorde | return true; |
740 | 321 | mdecorde | } |
741 | 321 | mdecorde | |
742 | 321 | mdecorde | |
/**
 * Builds the CQP source file of a corpus from its TEI-TXM files, then
 * indexes it with the CWB tools (cwb-encode followed by cwb-makeall).
 *
 * The files of txmDir are processed in alphabetical order, re-sorted
 * (stable sort) by composition date. The date is read from each file with
 * the XPath registered under the "datecompo" key of metadataXPath, or with
 * a default TEI header XPath when that key is absent. Files without a
 * usable "yyyy-mm-dd" or "yyyy" date are given the date 0-0-0.
 *
 * @param binDir the directory receiving the "cqp", "data" and "registry" directories
 * @param txmDir the directory containing the TEI-TXM files to compile
 * @param corpusname the corpus name, used to name the CQP file, the data directory and the registry file
 * @param metadataXPath the text metadata declarations (name -> XPath); may be null
 * @return true, if successful
 */
public boolean run(File binDir, File txmDir, String corpusname, Properties metadataXPath)
{
	sattrsListener = null; // reset SAttribute Listener for each new import
	this.metadataXPath = metadataXPath;

	if (!(CwbEncode.isExecutableAvailable() && CwbMakeAll.isExecutableAvailable())) {
		println ("Error: CWB executables not well set.")
		return false;
	}
	if (!txmDir.exists()) {
		println ("Error: TXM files directory does not exist: "+txmDir)
		return false;
	}

	// (re)create the working directories; mkdirs() also creates missing parents
	File cqpFile = new File(binDir, "cqp/${corpusname}.cqp");
	File cqpDir = new File(binDir, "/cqp/");
	cqpDir.deleteDir();
	cqpDir.mkdirs();
	File dataDir = new File(binDir, "/data/${corpusname}");
	dataDir.deleteDir();
	dataDir.mkdirs();
	new File(binDir, "registry/").mkdirs();

	// listFiles() returns null when txmDir is not a directory or cannot be read
	File[] listed = txmDir.listFiles();
	if (listed == null) {
		println ("Error: could not list the files of: "+txmDir)
		return false;
	}
	List<File> files = new ArrayList<File>(Arrays.asList(listed));

	// get the composition date of each text to be able to sort with it
	HashMap<File,Integer[]> filesiecle = new HashMap<File, Integer[]>()
	for (File f : files) {
		Integer[] date = new Integer[3];
		date[0] = date[1] = date[2] = 0;
		String xpath = "//tei:TEI/tei:teiHeader/tei:profileDesc/tei:creation/tei:date[@type='compo']/@when"
		if (metadataXPath != null && metadataXPath.containsKey("datecompo"))
			xpath = metadataXPath.get("datecompo")
		String datecompo = XPathResult.getXpathResponse(f, xpath);
		if (datecompo != null && datecompo.trim().length() > 0) {
			String[] split = datecompo.trim().split("-"); // yyyy-mm-dd
			try {
				// parse every component before storing any of them, so that a
				// malformed value leaves the date at 0-0-0 instead of half-filled
				if (split.length == 3) {
					int year = Integer.parseInt(split[0]);
					int month = Integer.parseInt(split[1]);
					int day = Integer.parseInt(split[2]);
					date[0] = year;
					date[1] = month;
					date[2] = day;
				} else if (split.length == 1) { // yyyy
					int year = Integer.parseInt(split[0]);
					date[0] = year;
					date[1] = 1;
					date[2] = 1;
				}
			} catch (NumberFormatException e) {
				println ("Warning: could not parse the composition date '"+datecompo+"' of "+f.getName())
			}
		}
		filesiecle.put(f, date);
	}

	Collections.sort(files); // Alpha order
	// Date order; the sort is stable, so texts with equal dates stay in alpha order
	Collections.sort(files, new Comparator<File>() {
		@Override
		public int compare(File o1, File o2) {
			Integer[] date1 = filesiecle.get(o1);
			Integer[] date2 = filesiecle.get(o2);
			for (int i = 0 ; i < 3 ; i++) { // year, then month, then day
				int cmp = date1[i].compareTo(date2[i]);
				if (cmp != 0) return cmp;
			}
			return 0;
		}
	});

	this.orderedFiles = files;
	println("process "+files.size()+" files ")

	// write the txmcorpus opening tag
	if (!createOutput(cqpFile)) {
		println "Error: could not write cqp file"
		return false;
	} else {
		output.write("<txmcorpus lang=\""+lang+"\">\n");
		output.close();
	}

	//1- Transform into CQP file
	for (File f : files) {
		if (!f.exists()) {
			println("file "+f+ " does not exists")
		} else {
			print "."
			// drops the last 4 characters of the file name (e.g. ".xml")
			String txtname = f.getName().substring(0, f.getName().length()-4);
			def builder = new compiler(f.toURI().toURL(), txtname, corpusname, "default", metadataXPath);
			builder.setLang(lang)
			if (!builder.transfomFileCqp(cqpFile)) {
				println "Failed to compile "+f
			}
			builder.setAnnotationDone(this.annotate_status);
		}
	}

	// write the txmcorpus closing tag
	if (!createOutput(cqpFile)) {
		println "Error: could not write cqp file"
		return false;
	} else {
		output.write("</txmcorpus>\n");
		output.close();
	}
	println ""

	//2- Import into CWB
	def outDir = binDir.getAbsolutePath();
	CwbEncode cwbEn = new CwbEncode();
	cwbEn.setDebug(debug);
	CwbMakeAll cwbMa = new CwbMakeAll();
	cwbMa.setDebug(debug);

	def pAttrs = ["id","q","sp","pb","lb","orig","sic","abbr","ref","pos","supplied","lang","nametype"];
	for (String type : anaTypes)
		pAttrs.add(type.substring(1)); // remove #

	// sattrsListener was reset to null at the start of this method.
	// NOTE(review): presumably it is set while the files are compiled - confirm.
	if (sattrsListener == null) {
		println "Error: no structure declaration available, was any file compiled?"
		return false;
	}
	structs = sattrsListener.getStructs();
	structsProf = sattrsListener.getProfs();
	if (debug) {
		println structs
		println structsProf
	}

	// attributes that are always declared for some structures, even when
	// they were not found in the sources
	def requiredAttrs = [
		"p": ["n"], "body": ["n"], "back": ["n"], "front": ["n"],
		"q": ["n", "lang"],
		"foreign": ["n", "lang"],
		"ab": ["n", "subtype", "rend"],
		"sp": ["n", "subtype", "rend"],
		"div": ["n", "id", "type", "subtype"],
		"name": ["n", "type"]
	]

	// add structures+properties found in sources
	List<String> sargs = new ArrayList<String>();
	for (String name : structs.keySet()) {
		if (name == "text") continue; // added after
		List<String> attrs = new ArrayList<String>();
		for (String value : structs.get(name))
			attrs.add(value);
		// compare whole attribute names: a substring test such as
		// contains("+n") would be fooled by "+name"
		List<String> required = requiredAttrs.get(name);
		if (required != null) {
			for (String attr : required)
				if (!attrs.contains(attr)) attrs.add(attr);
		}
		String concat = name+":"+structsProf.get(name); // append the depth
		for (String attr : attrs) // append the attributes
			concat += "+"+attr;
		sargs.add(concat);
	}

	String textSAttributes = "text:0+id+base+project";
	if (metadataXPath != null) {
		for (String meta : metadataXPath.keySet()) // text property declarations from metadata.csv
			textSAttributes+="+"+meta;
	}
	if (metadataXPath == null || !metadataXPath.containsKey("sigle"))
		textSAttributes+="+sigle";

	sargs.add(textSAttributes)
	sargs.add("txmcorpus:0+lang")
	Collections.sort(sargs);

	String[] sAttributes = sargs;
	String[] pAttributes = pAttrs;
	println "P-attributes: "+pAttributes
	println "S-attributes: "+sargs

	try {
		String regPath = outDir + "/registry/"+corpusname.toLowerCase(); // CQP wants lower case registry files
		cwbEn.run(outDir + "/data/${corpusname}", outDir + "/cqp/"+corpusname+".cqp", regPath,pAttributes, sAttributes);
		if (!new File(regPath).exists()) {
			println "Error: The registry file was not created: $regPath. See https://groupes.renater.fr/wiki/txm-users/public/faq"
			return false;
		}
		cwbMa.run(corpusname, outDir + "/registry");
	} catch (Exception ex) {
		System.out.println(ex);
		ex.printStackTrace(); // keep the stack trace to diagnose CWB failures
		return false;
	}

	return true;
}
958 | 321 | mdecorde | |
/**
 * Turns the debug flag on; run() passes this flag to the CWB wrappers
 * and prints the collected structures when it is set.
 */
public void setDebug()
{
	debug = true;
}
966 | 321 | mdecorde | |
/**
 * test purpose.
 *
 * Paths are resolved against the "user.home" system property:
 * java.io.File does not expand a leading "~" the way a shell does.
 *
 * @param args the arguments (unused)
 */
public static void main(String[] args)
{
	String home = System.getProperty("user.home");
	File dir = new File(home, "xml/bfm");
	def c = new compiler();
	c.setDebug();
	c.setCwbPath(home + "/TXM/cwb/bin");
	// NOTE(review): the run() visible next to this method takes
	// (binDir, txmDir, corpusname, metadataXPath); confirm that a
	// single-argument overload still exists.
	c.run(dir);
}
980 | 321 | mdecorde | } |