root / tmp / org.txm.core / src / java / org / txm / importer / scripts / xmltxm / Xml2Ana.groovy @ 1000
History | View | Annotate | Download (15.5 kB)
1 | 986 | mdecorde | // Copyright © 2010-2013 ENS de Lyon.
|
---|---|---|---|
2 | 986 | mdecorde | // Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
|
3 | 986 | mdecorde | // Lyon 2, University of Franche-Comté, University of Nice
|
4 | 986 | mdecorde | // Sophia Antipolis, University of Paris 3.
|
5 | 986 | mdecorde | //
|
6 | 986 | mdecorde | // The TXM platform is free software: you can redistribute it
|
7 | 986 | mdecorde | // and/or modify it under the terms of the GNU General Public
|
8 | 986 | mdecorde | // License as published by the Free Software Foundation,
|
9 | 986 | mdecorde | // either version 2 of the License, or (at your option) any
|
10 | 986 | mdecorde | // later version.
|
11 | 986 | mdecorde | //
|
12 | 986 | mdecorde | // The TXM platform is distributed in the hope that it will be
|
13 | 986 | mdecorde | // useful, but WITHOUT ANY WARRANTY; without even the implied
|
14 | 986 | mdecorde | // warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
15 | 986 | mdecorde | // PURPOSE. See the GNU General Public License for more
|
16 | 986 | mdecorde | // details.
|
17 | 986 | mdecorde | //
|
18 | 986 | mdecorde | // You should have received a copy of the GNU General
|
19 | 986 | mdecorde | // Public License along with the TXM platform. If not, see
|
20 | 986 | mdecorde | // http://www.gnu.org/licenses.
|
21 | 986 | mdecorde | //
|
22 | 986 | mdecorde | //
|
23 | 986 | mdecorde | //
|
24 | 986 | mdecorde | // $LastChangedDate: 2017-04-19 16:23:38 +0200 (mer. 19 avril 2017) $
|
25 | 986 | mdecorde | // $LastChangedRevision: 3430 $
|
26 | 986 | mdecorde | // $LastChangedBy: mdecorde $
|
27 | 986 | mdecorde | //
|
28 | 986 | mdecorde | package org.txm.importer.scripts.xmltxm
|
29 | 986 | mdecorde | |
30 | 986 | mdecorde | import org.txm.importer.StaxIdentityParser; |
31 | 986 | mdecorde | |
32 | 986 | mdecorde | import java.text.DateFormat; |
33 | 986 | mdecorde | import java.util.ArrayList; |
34 | 986 | mdecorde | import java.util.Date; |
35 | 986 | mdecorde | import java.util.HashMap; |
36 | 986 | mdecorde | import java.util.Locale; |
37 | 986 | mdecorde | |
38 | 986 | mdecorde | import javax.xml.stream.*; |
39 | 986 | mdecorde | |
40 | 986 | mdecorde | import java.net.URL; |
41 | 986 | mdecorde | |
42 | 986 | mdecorde | import org.txm.importer.filters.*; |
43 | 1000 | mdecorde | import org.txm.scripts.importer.HasElement |
44 | 1000 | mdecorde | import org.txm.scripts.importer.StaxStackWriter |
45 | 986 | mdecorde | import org.txm.utils.AsciiUtils; |
46 | 986 | mdecorde | |
/**
 * Transforms a "pre" XML-TEI file into an XML-TEI-TXM file.
 *
 * While the input is copied to the output:
 * <ul>
 * <li>the text of every word element (see {@link #setWordTag(String)}) is wrapped
 * in a <code>txm:form</code> element;</li>
 * <li>word attributes registered with {@link #setCorrespondances} are moved into
 * <code>txm:ana</code> child elements (optionally all attributes, see
 * {@link #setConvertAllAtrtibutes(boolean)});</li>
 * <li>missing <code>TEI</code>, <code>teiHeader</code>, <code>encodingDesc</code>,
 * <code>classDecl</code> and <code>text</code> elements are generated.</li>
 * </ul>
 *
 * The correspondence between word attributes and ana types / respStmt ids, and
 * the header information, must be supplied with {@link #setCorrespondances} and
 * {@link #setHeaderInfos} before the file is processed.
 *
 * @author mdecorde
 */
class Xml2Ana extends StaxIdentityParser
{
	/** Local name of the text element. */
	public static final String TEXT = "text"

	/** Local name of the identifier attribute. */
	public static final String ID = "id"

	/** Local name of the generated annotation element. */
	public static String ANA = "ana"

	/** Name of the "resp" element / attribute. */
	public static String RESP = "resp"

	/** Name of the "type" attribute. */
	public static String TYPE = "type"

	/** The dir (not used by the code of this class). */
	private def dir;

	/** When true, unmapped word attributes are also converted to txm:ana elements. */
	private boolean convertAllAttributes = false;

	/** Word attribute name -> ana type. */
	HashMap<String,String> correspType;

	/** Word attribute name -> respStmt id. */
	HashMap<String,String> correspRef;

	/** Watched tag name -> whether the tag has been met in the input. */
	HashMap<String,Boolean> checkTags = new HashMap<String,Boolean>();

	/** Ids of the respStmt elements to write. */
	def respId = [];

	/** respStmt id -> application description, read as [ident, version, report file]. */
	HashMap<String,File> applications;

	/** respStmt id -> taxonomies used by that application. */
	HashMap<String,String[]> taxonomies;

	/** respStmt id -> values read as [description, person, date ("when"), date text]. */
	HashMap<String,String[]> resps;

	/** Taxonomy id -> (ref type -> target URI). */
	HashMap<String,HashMap<String,String>> items;

	/** The XML headeradded (not used by the code of this class). */
	boolean XMLHeaderadded = false;

	/** Input file name without its extension; used as id of the text element. */
	String textname;

	/** Local name of the word elements. */
	String wtag = "w";

	/** Number of word elements met so far. */
	int idcount = 0;

	/** True while the content of a word element is being read. */
	boolean flagWord = false;

	/** Number of start elements met so far. */
	int firstElement = 0;

	/** True when the TEI element was generated because the input had none. */
	boolean teiElementAdded = false;

	/** True when the teiHeader element was generated at the start of the document. */
	boolean teiHeaderElementAdded = false;

	/** True when the document has a text element, or once one has been generated. */
	boolean hasText = false;

	/** True when the text element was generated and must be closed in {@link #after()}. */
	boolean textElementAdded = false;

	/** Pending annotations of the current word, each one as [resp, type, value]. */
	def anabalises = [];

	boolean hasClassDecl = false;
	boolean hasFileDesc = false;
	boolean hasEncodingDesc = false;
	boolean hasTeiHeader = false;
	boolean hasTEI = false;

	/**
	 * Instantiates a new Xml2Ana.
	 *
	 * The file is scanned once with {@link HasElement} to learn whether it
	 * declares a text element.
	 *
	 * @param file the pre XML-TEI file to transform
	 */
	public Xml2Ana(File file) {
		super(file.toURI().toURL());

		// the text name is the file name without its last extension
		textname = file.getName();
		int idx = textname.lastIndexOf(".");
		if (idx > 0) {
			textname = textname.substring(0, idx)
		}

		checkTags.put("respStmt", false);
		checkTags.put("titleStmt", false);
		checkTags.put("appInfo", false);

		hasText = new HasElement(file, TEXT).process();
	}

	/**
	 * Sets whether the word attributes without a registered correspondence are
	 * converted to txm:ana elements instead of being copied as attributes.
	 *
	 * @param value true to convert all attributes
	 * @return the assigned value
	 */
	public setConvertAllAtrtibutes(boolean value) {
		convertAllAttributes = value;
	}

	/**
	 * Sets the local name of the elements processed as words.
	 *
	 * @param wtag the word element name
	 * @return the assigned value
	 */
	public setWordTag(String wtag) {
		this.wtag = wtag
	}

	/**
	 * Handles a start element: rewrites word elements, completes the namespace
	 * declarations of TEI, and generates the missing TEI, teiHeader and text
	 * elements before copying any other element.
	 */
	protected void processStartElement()
	{
		firstElement++;

		String name = parser.getLocalName();
		if (checkTags.containsKey(name)) {
			checkTags.put(name, true);
		}

		switch (name) {
			case wtag:
				writeWordStart();
				break;

			case "TEI":
				super.processStartElement();
				declareMissingNamespaces();
				break;

			default:
				if (TEXT.equals(localname)) {
					hasText = true;
				}

				if (firstElement == 1) { // test if first element is TEI
					if (localname != "TEI") { // "TEI" is missing
						teiElementAdded = true;
						addTEIElement();
					} else if (!hasText) {
						// NOTE(review): if localname always mirrors parser.getLocalName()
						// (it is set outside this class), this branch cannot run because
						// "TEI" is handled by its own case - confirm before removing.
						openMissingTextElement();
					}
				}

				if (firstElement == 2 && !teiElementAdded) {
					if (localname != "teiHeader") { // teiHeader is missing
						writeTeiHeader();
						hasTeiHeader = true
						teiHeaderElementAdded = true
					}
				} else if (!hasText && (teiElementAdded || teiHeaderElementAdded)) {
					openMissingTextElement();
				}

				super.processStartElement();

				// a text element without id receives the text name as id
				if (TEXT.equals(localname) && !parser.getAttributeValue(null, ID)) {
					writer.writeAttribute(ID, textname);
				}
		}
	}

	/** Opens the generated text element and records that it must be closed later. */
	private void openMissingTextElement()
	{
		writer.writeStartElement(TEXT);
		writer.writeAttribute(ID, textname);
		textElementAdded = true;
		hasText = true;
	}

	/** Declares the TEI (default) and TXM namespaces the current element does not declare. */
	private void declareMissingNamespaces()
	{
		boolean hasTeiNS = false;
		boolean hasTXMNs = false;
		for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
			String uri = parser.getNamespaceURI(i);
			if (uri == TXMNS) {
				hasTXMNs = true;
			} else if (uri == TEINS) {
				hasTeiNS = true;
			}
		}
		if (!hasTeiNS) {
			writer.writeDefaultNamespace(TEINS);
		}
		if (!hasTXMNs) {
			writer.writeNamespace(TXM, TXMNS);
		}
	}

	/** Writes the start of a word: its kept attributes, then the opening of txm:form. */
	private void writeWordStart()
	{
		if (!hasText) {
			openMissingTextElement();
		}
		idcount++; // increment word counter
		anabalises.clear();

		writer.writeStartElement(parser.getLocalName()); // write w

		for (int i = 0 ; i < parser.getNamespaceCount() ; i++) { // write namespaces
			writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
		}

		for (int i = 0 ; i < parser.getAttributeCount() ; i++) { // transform attributes
			String type = parser.getAttributeLocalName(i);
			String value = parser.getAttributeValue(i);
			if (correspType.containsKey(type)) { // becomes a txm:ana
				String corresptype = correspType.get(type);
				String ref = correspRef.get(type);
				anabalises.add(["#"+ref, "#"+corresptype, value]);
			} else if (type == ID) { // keep id attribute
				// remove characters not compatible with the id attribute value
				writer.writeAttribute(type, AsciiUtils.buildAttributeId(prefixWordId(value)));
			} else if (convertAllAttributes) {
				anabalises.add(["none", "#"+type, value])
			} else { // keep the attribute that was in the original <w>
				writer.writeAttribute(type, value);
			}
		}

		flagWord = true; // start to capture the form
		writer.writeStartElement(TXMNS, "form");
	}

	/** Turns a "w..." id into a "w_..." id; any other id is returned unchanged. */
	private String prefixWordId(String wordid)
	{
		if (wordid.startsWith("w") && !wordid.startsWith("w_")) {
			return "w_"+wordid.substring(1);
		}
		return wordid;
	}

	/**
	 * Closes the generated text and TEI elements, if any, then lets the
	 * superclass finish.
	 */
	protected void after()
	{
		if (textElementAdded) {
			writer.writeEndElement(); // text
		}
		if (teiElementAdded) {
			writer.writeEndElement(); // TEI
		}
		super.after(); // close writer, parser, etc
	}

	/**
	 * Writes the start of a TEI element with its namespace declarations,
	 * followed by a complete generated teiHeader.
	 */
	protected void addTEIElement()
	{
		writer.writeStartElement("TEI");
		writer.writeDefaultNamespace(TEINS);
		writer.writeNamespace(TXM, TXMNS);
		writer.writeNamespace(TEI, TEINS);
		writeTeiHeader();
	}

	/**
	 * Writes character data; inside a word the text is trimmed so that the
	 * form stays on one line.
	 */
	protected void processCharacters()
	{
		if (!flagWord) {
			super.processCharacters();
			return;
		}
		writer.writeCharacters(parser.getText().trim()); // keep form in 1 line
	}

	/**
	 * Handles an end element: closes txm:form and writes the pending txm:ana
	 * elements of a word, and completes the header elements before they are
	 * closed by the superclass.
	 */
	protected void processEndElement()
	{
		switch (parser.getLocalName()) {
			case wtag:
				writer.writeEndElement(); // txm:form
				for (def values : anabalises) {
					// <txm:ana resp=ref type=corresptype>value</txm:ana>
					writer.writeStartElement(TXMNS, ANA);
					writer.writeAttribute(RESP, values[0]);
					writer.writeAttribute(TYPE, values[1]);
					writer.writeCharacters(values[2]);
					writer.writeEndElement(); // txm:ana
				}
				flagWord = false;
				break;

			case "fileDesc":
				hasFileDesc = true;
				this.writeTXMResps();
				break;

			case "classDecl":
				hasClassDecl = true;
				this.writeTXMTaxonomies();
				break;

			case "encodingDesc":
				hasEncodingDesc = true;
				writeContentOfEncodingDesc();
				break;

			case "teiHeader":
				hasTeiHeader = true
				if (!hasEncodingDesc) { // add the missing encodingDesc
					writer.writeStartElement("encodingDesc");
					writeContentOfEncodingDesc();
					writer.writeEndElement();
				}
				break;

			case "TEI":
				hasTEI = true;
				if (!hasTeiHeader) {
					writeTeiHeader();
				}
				break;
		}

		super.processEndElement();
	}

	/**
	 * Writes a complete teiHeader: a fileDesc holding the respStmt elements and
	 * empty titleStmt, publicationStmt and sourceDesc, then an encodingDesc.
	 */
	protected void writeTeiHeader()
	{
		writer.writeStartElement("teiHeader");

		writer.writeStartElement("fileDesc")
		this.writeTXMResps();
		writer.writeStartElement("titleStmt")
		writer.writeStartElement("title")
		writer.writeEndElement(); // title
		writer.writeEndElement(); // titleStmt
		writer.writeStartElement("publicationStmt")
		writer.writeEndElement(); // publicationStmt
		writer.writeStartElement("sourceDesc")
		writer.writeEndElement(); // sourceDesc
		writer.writeEndElement(); // fileDesc

		writer.writeStartElement("encodingDesc");
		writeContentOfEncodingDesc();
		writer.writeEndElement(); // encodingDesc

		writer.writeEndElement(); // teiHeader
	}

	/**
	 * Writes the appInfo element and, when no classDecl element has been met
	 * in the input, a classDecl element holding the taxonomies.
	 */
	protected void writeContentOfEncodingDesc()
	{
		writer.writeStartElement("appInfo")
		this.writeTXMApps();
		writer.writeEndElement(); // appInfo

		if (hasClassDecl) {
			return;
		}
		writer.writeStartElement("classDecl");
		this.writeTXMTaxonomies();
		writer.writeEndElement(); // classDecl
	}

	/**
	 * Lists the watched tags that have been met in the input so far.
	 *
	 * @return "found tags : " followed by one tab-indented line per found tag
	 */
	public String checkResp()
	{
		StringBuilder rez = new StringBuilder("found tags : \n");
		for (Map.Entry<String, Boolean> entry : checkTags.entrySet()) {
			// only report the tags flagged by processStartElement()
			if (entry.getValue()) {
				rez.append("\t").append(entry.getKey()).append("\n");
			}
		}
		return rez.toString();
	}

	/**
	 * Sets the correspondences between word attributes and annotations.
	 *
	 * @param correspRef word attribute name -> respStmt id
	 * @param correspType word attribute name -> ana type
	 */
	public void setCorrespondances(correspRef, correspType)
	{
		this.correspRef = correspRef;
		this.correspType = correspType;
	}

	/**
	 * Sets the header infos.
	 *
	 * @param respId the ids of the respStmt elements
	 * @param resps respStmt id -> [description, person, date ("when"), date text]
	 * @param applications respStmt id -> [ident, version, report file]
	 * @param taxonomies respStmt id -> taxonomies used
	 * @param items taxonomy id -> (ref type -> target)
	 */
	public void setHeaderInfos(respId, resps, applications, taxonomies, items)
	{
		this.respId = respId
		this.resps = resps
		this.applications = applications
		this.taxonomies = taxonomies;
		this.items = items;
	}

	/**
	 * Writes one respStmt element per registered resp id.
	 */
	public void writeTXMResps()
	{
		for (String ref : respId) {
			String[] infos = resps.get(ref);

			writer.writeStartElement("respStmt");

			writer.writeStartElement(RESP);
			writer.writeAttribute(ID, ref);
			writer.writeCharacters(infos[0]);
			writer.writeStartElement("date");
			writer.writeAttribute("when", infos[2]);
			writer.writeCharacters(infos[3]);
			writer.writeEndElement(); // date
			writer.writeEndElement(); // resp

			writer.writeStartElement("name");
			writer.writeAttribute(TYPE, "person");
			writer.writeCharacters(infos[1])
			writer.writeEndElement(); // name

			writer.writeEndElement(); // respStmt
		}
	}

	/**
	 * Writes one txm:application element per registered resp id, including the
	 * non-empty lines of its report file (when there is one) and the list of
	 * the taxonomies it uses.
	 */
	public void writeTXMApps()
	{
		for (String ref : respId) {
			List appInfos = applications.get(ref);
			String ident = appInfos.get(0);
			String version = appInfos.get(1);
			File report = appInfos.get(2);

			writer.writeStartElement(TXMNS, "application");
			writer.writeAttribute("ident", ident);
			writer.writeAttribute("version", version);
			writer.writeAttribute(RESP, ref);

			// get txm:commandLine from GeneratedReport
			if (report != null) {
				// close the pending start tag and flush before writing to the raw output
				writer.writeCharacters("");
				writer.flush();
				// NOTE(review): the report lines go straight to 'output', bypassing
				// the XML writer - presumably the report is already XML; confirm.
				BufferedReader reader = new BufferedReader(new FileReader(report));
				try {
					String line = reader.readLine();
					while (line != null) {
						if (line.length() != 0) {
							output.write(line+"\n");
						}
						line = reader.readLine();
					}
				} finally {
					reader.close(); // released even when reading or writing fails
				}
			}

			writer.writeStartElement("ab");
			writer.writeAttribute(TYPE, "annotation");
			for (String item : taxonomies.get(ref)) {
				writer.writeStartElement("list");
				writer.writeEmptyElement("ref");
				writer.writeAttribute(TYPE, "tagset");
				writer.writeAttribute("target", item);
				writer.writeEndElement(); // list
			}
			writer.writeEndElement(); // ab

			writer.writeEndElement(); // txm:application
		}
	}

	/**
	 * Writes one taxonomy element per registered item set, with a bibl element
	 * holding its title and one ref element per (type, target) pair.
	 */
	public void writeTXMTaxonomies()
	{
		for (String tax : items.keySet()) {
			HashMap<String,String> refs = items.get(tax);

			writer.writeStartElement("taxonomy");
			writer.writeAttribute(ID, tax);

			writer.writeStartElement("bibl");
			writer.writeAttribute(TYPE, "tagset");
			writer.writeStartElement("title");
			writer.writeCharacters(tax);
			writer.writeEndElement(); // title

			for (String type : refs.keySet()) {
				writer.writeEmptyElement("ref");
				writer.writeAttribute(TYPE, type);
				writer.writeAttribute("target", refs.get(type));
			}

			writer.writeEndElement(); // bibl
			writer.writeEndElement(); // taxonomy
		}
	}

	/**
	 * The main method: sample run on a hard-coded file.
	 *
	 * @param args the arguments (not used)
	 */
	public static void main(String[] args) {

		// the JVM does not expand "~": resolve against the user home directory
		File rootDir = new File(System.getProperty("user.home"), "xml/rgaqcj");
		new File(rootDir, "anainline").mkdirs();

		String file = "roland-p5.xml";
		String anafile = "roland-p5.xml";

		// correspType (wlx word attribute -> type attribute of the ana property of the txm w)
		def correspType = new HashMap<String,String>()
		correspType.put("p2", "CATTEX2009");

		// correspRef (wlx word attribute -> ref attribute of the ana property of the txm w;
		// ref points to the id of the respStmt of the teiHeader)
		def correspRef = new HashMap<String,String>()
		correspRef.put("p2", "ctx1");

		// the ids of all the respStmt must be listed
		def respId = ["ctx1"];//,"TT1", "TnT1"];

		// maps each respId to the execution report of the tool
		def applications = new HashMap<String,List>();
		applications.put("ctx1", new ArrayList());
		applications.get("ctx1").add("Oxygen");// app ident
		applications.get("ctx1").add("9.3");// app version
		applications.get("ctx1").add(null);// app report file path

		// maps each respId to the type attributes of the ana property of the txm w,
		// to build the refs to the taxonomies
		def taxonomiesUtilisees = new HashMap<String,String[]>();
		taxonomiesUtilisees.put("ctx1", ["CATTEX2009"]);//,"lemma","lasla","grace"]);

		// associates an item id with its description and its URI
		def itemsURI = new HashMap<String,HashMap<String,String>>();
		itemsURI.put("CATTEX2009", new HashMap<String,String>());
		itemsURI.get("CATTEX2009").put("tagset", "http://bfm.ens-lsh.fr/IMG/xml/cattex2009.xml");
		itemsURI.get("CATTEX2009").put("website", "http://bfm.ens-lsh.fr/article.php3?id_article=176");

		// respStmt information
		// resps (respId <see above>, [description, person, date, date text])
		def resps = new HashMap<String,String[]>();
		resps.put("ctx1", ["initial tagging", "alavrentiev", "2010-03-02", "Tue Mar 2 21:02:55 Paris, Madrid 2010"])

		// run the process
		def builder = new Xml2Ana(new File(new File(rootDir, "src"), file));
		builder.setCorrespondances(correspRef, correspType);
		builder.setHeaderInfos(respId, resps, applications, taxonomiesUtilisees, itemsURI)
		// output directory + output file name
		// NOTE(review): only the file name is passed, as in the original; the
		// "anainline" directory created above is presumably the intended target - confirm.
		builder.process(anafile);
	}

}