Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / teitxm / Xml2Ana.groovy @ 479

History | View | Annotate | Download (15.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-04-19 16:23:38 +0200 (mer. 19 avril 2017) $
25
// $LastChangedRevision: 3430 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.teitxm;
29

    
30
import org.txm.importer.HasElement;
31
import org.txm.importer.StaxIdentityParser;
32

    
33
import java.text.DateFormat;
34
import java.util.ArrayList;
35
import java.util.Date;
36
import java.util.HashMap;
37
import java.util.Locale;
38

    
39
import javax.xml.stream.*;
40

    
41
import java.net.URL;
42

    
43
import org.txm.importer.filters.*;
44
import org.txm.utils.AsciiUtils;
45

    
46
// TODO: Auto-generated Javadoc
47
/**
 * The Class Xml2Ana.
 *
 * Transforms a pre XML-TEI file into an XML-TEI-TXM file.
 * The pre XML-TEI file must contain a minimal teiHeader with classDecl, encodingDesc and titleStmt.
 *
 * You must specify the correspondence between the word attributes and the ana types and respStmt IDs;
 * the attributes of the w tags are then transformed into txm:ana elements.
 *
 * @author mdecorde
 */
57
public class Xml2Ana extends StaxIdentityParser
{
	/** Local name of the text element that must wrap the content of the output. */
	public static final String TEXT = "text"

	/** Name of the identifier attribute. */
	public static final String ID = "id"

	/** Local name of the txm:ana element written for each converted word attribute. */
	public static String ANA = "ana"

	/** Name of the "resp" attribute; also used as the name of the resp element of a respStmt. */
	public static String RESP = "resp"

	/** Name of the "type" attribute. */
	public static String TYPE = "type"

	/** The dir (not used by the code of this class). */
	private def dir;

	/** When true, word attributes without a declared correspondence become txm:ana elements instead of being copied. */
	private boolean convertAllAttributes = false;

	/** Maps a word attribute name to the type of the txm:ana element to write. */
	HashMap<String,String> correspType;

	/** Maps a word attribute name to the identifier of the respStmt referenced by the txm:ana element. */
	HashMap<String,String> correspRef;

	/** Tags to watch for; the value becomes true once a start tag with that local name has been seen. */
	HashMap<String,Boolean> checkTags;

	/** Identifiers of all the respStmt to write. */
	def respId = [];

	/**
	 * Applications, indexed by resp id. Despite the declared value type, writeTXMApps() reads each
	 * value as a list: [ident, version, report file (may be null)].
	 */
	HashMap<String,File> applications;

	/** Taxonomy references used by each application, indexed by resp id. */
	HashMap<String,String[]> taxonomies;

	/** respStmt informations, indexed by resp id: [description, person, "when" date, date text]. */
	HashMap<String,String[]> resps;

	/** For each taxonomy id, the type -> target pairs of the ref elements to write. */
	HashMap<String,HashMap<String,String>> items;

	/** The XML headeradded (not read by the code of this class). */
	boolean XMLHeaderadded = false;

	/** Name of the source file without its extension; used as id of the text element. */
	String textname;

	/** Local name of the word elements to transform. */
	String wtag = "w";

	/** Number of word elements seen so far. */
	int idcount = 0;

	/** True while the parser is inside a word element, i.e. while its form is being captured. */
	boolean flagWord = false;

	/** Number of start tags seen so far (1 for the root element, 2 for the next one...). */
	int firstElement = 0;

	/** True if a TEI root element has been generated because the source has none. */
	boolean teiElementAdded = false;

	/** True if a teiHeader has been generated right after the TEI start tag. */
	boolean teiHeaderElementAdded = false;

	/** True if the source contains a text element, or once one has been generated. */
	boolean hasText = false;

	/** True if a text element has been generated and must be closed in after(). */
	boolean textElementAdded = false;

	/** The [resp, type, value] triples of the current word, written as txm:ana when the word ends. */
	def anabalises = [];

	boolean hasClassDecl = false;
	boolean hasFileDesc = false;
	boolean hasEncodingDesc = false;
	boolean hasTeiHeader = false;
	boolean hasTEI = false;

	/**
	 * Instantiates a new xml2 ana.
	 *
	 * @param file the pre xml-tei file to transform; its name without extension becomes the text id
	 */
	public Xml2Ana(File file) {
		super(file.toURI().toURL());

		// strip the extension, but keep names such as ".hidden" untouched
		textname = file.getName();
		int idx = textname.lastIndexOf(".");
		if (idx > 0) {
			textname = textname.substring(0, idx)
		}

		checkTags = new HashMap<String,Boolean>();
		checkTags.put("respStmt",false);
		checkTags.put("titleStmt",false);
		checkTags.put("appInfo",false);

		// look for a text element in the source beforehand, to know if one must be generated
		hasText = new HasElement(file, TEXT).process();
	}

	/**
	 * Sets the convert all attributes flag.
	 *
	 * @param value if true, the word attributes that have no correspondence are converted to
	 * txm:ana elements; if false they are copied as attributes
	 * @return the java.lang. object
	 */
	public setConvertAllAtrtibutes(boolean value) {
		convertAllAttributes = value;
	}

	/**
	 * Sets the local name of the word elements.
	 *
	 * @param wtag the local name of the elements to process as words
	 * @return the java.lang. object
	 */
	public setWordTag(String wtag) {
		this.wtag = wtag
	}

	/**
	 * Processes a start tag: transforms the word elements, completes the namespaces of the TEI
	 * element and generates the TEI, teiHeader and text elements when they are missing.
	 */
	protected void processStartElement()
	{
		firstElement++;
		String name = parser.getLocalName();
		if (checkTags.containsKey(name)) {
			checkTags.put(name, true);
		}

		switch (name) {
			case wtag:
				startWord();
				break;

			case "TEI":
				super.processStartElement();
				declareMissingNamespaces();
				break;

			default:
				if (TEXT.equals(localname)) {
					hasText = true;
				}

				if (firstElement == 1) { // test if first element is TEI
					if (localname != "TEI") { // "TEI" is missing
						teiElementAdded = true;
						addTEIElement();
					} else if (!hasText) {
						openMissingTextElement();
					}
				}
				if (firstElement == 2 && teiElementAdded != true) {
					if (localname != "teiHeader") { // teiHeader is missing
						writeTeiHeader();
						hasTeiHeader = true
						teiHeaderElementAdded = true
					}
				} else if (!hasText & (teiElementAdded | teiHeaderElementAdded)) {
					openMissingTextElement();
				}

				super.processStartElement();

				// a text element of the source without id gets the file name as id
				if (TEXT.equals(localname) && !parser.getAttributeValue(null, ID)) {
					writer.writeAttribute(ID, textname);
				}
		}
	}

	/**
	 * Opens the generated text element, identified by the file name, and remembers that it must
	 * be closed in after().
	 */
	private void openMissingTextElement()
	{
		writer.writeStartElement(TEXT);
		writer.writeAttribute(ID, textname);
		textElementAdded = true;
		hasText = true;
	}

	/**
	 * Writes the start of a word: the start tag, its namespaces, the attributes that are kept,
	 * then opens the txm:form element. The attributes to convert are stored in anabalises and
	 * written when the word ends.
	 */
	private void startWord()
	{
		if (!hasText) {
			openMissingTextElement();
		}
		idcount++; // increment word counter
		anabalises.clear();

		writer.writeStartElement(parser.getLocalName()); // write w

		for (int i = 0 ; i < parser.getNamespaceCount() ; i++) { // write namespaces
			writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
		}

		for (int i = 0 ; i < parser.getAttributeCount() ; i++) { // transform attributes
			String type = parser.getAttributeLocalName(i);
			String value = parser.getAttributeValue(i);
			if (correspType.containsKey(type)) { // declared correspondence: becomes a txm:ana
				anabalises.add(["#"+correspRef.get(type), "#"+correspType.get(type), value]);
			} else if (type == ID) { // keep id attribute
				String wordid = value
				// "w12" becomes "w_12"; ids not starting with "w" are kept as they are
				if (wordid.startsWith("w") && !wordid.startsWith("w_")) {
					wordid = "w_"+wordid.substring(1)
				}
				// remove characters not compatible with the id attribute value
				writer.writeAttribute(type, AsciiUtils.buildAttributeId(wordid));
			} else if (convertAllAttributes) { // undeclared attribute converted to txm:ana
				anabalises.add(["none","#"+type, value])
			} else { // undeclared attribute copied from the original <w>
				writer.writeAttribute(type, value);
			}
		}

		flagWord = true; // start to capture the form
		writer.writeStartElement(TXMNS, "form");
	}

	/**
	 * Declares the TEI default namespace and the TXM namespace on the TEI element that has just
	 * been written, if the source does not declare them.
	 */
	private void declareMissingNamespaces()
	{
		boolean teiDeclared = false;
		boolean txmDeclared = false;
		for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
			String uri = parser.getNamespaceURI(i);
			if (uri == TXMNS) {
				txmDeclared = true;
			} else if (uri == TEINS) {
				teiDeclared = true;
			}
		}
		if (!teiDeclared) {
			writer.writeDefaultNamespace(TEINS);
		}
		if (!txmDeclared) {
			writer.writeNamespace(TXM, TXMNS);
		}
	}

	/**
	 * Closes the text and TEI elements that were generated, then lets the parent class finish.
	 */
	protected void after()
	{
		if (textElementAdded) {
			writer.writeEndElement(); // text
		}
		if (teiElementAdded) {
			writer.writeEndElement(); // TEI
		}
		super.after(); // close writer, parser, etc
	}

	/**
	 * Writes the start of a TEI element with its namespace declarations, followed by a generated
	 * teiHeader. The element is left open.
	 */
	protected void addTEIElement()
	{
		writer.writeStartElement("TEI");
		writer.writeDefaultNamespace(TEINS);
		writer.writeNamespace(TXM, TXMNS);
		writer.writeNamespace(TEI, TEINS);
		writeTeiHeader();
	}

	/**
	 * Writes the text content: trimmed inside a word, delegated to the parent class elsewhere.
	 */
	protected void processCharacters()
	{
		if (flagWord) {
			writer.writeCharacters(parser.getText().trim()); // keep form in 1 line
		} else {
			super.processCharacters();
		}
	}

	/**
	 * Processes an end tag: writes the txm:ana elements of a word and completes the header
	 * elements, then lets the parent class process the end tag.
	 */
	protected void processEndElement()
	{
		switch (parser.getLocalName()) {
			case wtag:
				writer.writeEndElement(); // txm:form
				for (def values : anabalises) {
					// <txm:ana resp=ref type=corresptype>value</txm:ana>
					writer.writeStartElement(TXMNS, ANA);
					writer.writeAttribute(RESP, values[0]);
					writer.writeAttribute(TYPE, values[1]);
					writer.writeCharacters(values[2]);
					writer.writeEndElement(); // txm:ana
				}
				flagWord = false;
				break;

			case "fileDesc":
				hasFileDesc = true;
				this.writeTXMResps();
				break;

			case "classDecl":
				hasClassDecl = true;
				this.writeTXMTaxonomies();
				break;

			case "encodingDesc":
				hasEncodingDesc = true;
				writeContentOfEncodingDesc();
				break;

			case "teiHeader":
				hasTeiHeader = true
				if (!hasEncodingDesc) { // the source header has no encodingDesc: generate one
					writer.writeStartElement("encodingDesc");
					writeContentOfEncodingDesc();
					writer.writeEndElement();
				}
				break;

			case "TEI":
				hasTEI = true;
				if (!hasTeiHeader) {
					writeTeiHeader();
				}
				break;
		}

		super.processEndElement();
	}

	/**
	 * Writes a complete teiHeader: a fileDesc containing the respStmt elements and empty
	 * titleStmt, publicationStmt and sourceDesc, then an encodingDesc.
	 */
	protected void writeTeiHeader()
	{
		writer.writeStartElement("teiHeader");

		writer.writeStartElement("fileDesc")
		this.writeTXMResps();
		writer.writeStartElement("titleStmt")
		writer.writeStartElement("title")
		writer.writeEndElement(); // title
		writer.writeEndElement(); // titleStmt
		writer.writeStartElement("publicationStmt")
		writer.writeEndElement(); // publicationStmt
		writer.writeStartElement("sourceDesc")
		writer.writeEndElement(); // sourceDesc
		writer.writeEndElement(); // fileDesc

		writer.writeStartElement("encodingDesc");
		writeContentOfEncodingDesc();
		writer.writeEndElement(); // encodingDesc

		writer.writeEndElement(); // teiHeader
	}

	/**
	 * Writes the content of encodingDesc: the appInfo element and, if no classDecl has been
	 * closed so far, a classDecl containing the taxonomies.
	 */
	protected void writeContentOfEncodingDesc()
	{
		writer.writeStartElement("appInfo")
		this.writeTXMApps();
		writer.writeEndElement(); // appInfo

		if (!hasClassDecl) {
			writer.writeStartElement("classDecl");
			this.writeTXMTaxonomies();
			writer.writeEndElement(); // classDecl
		}
	}

	/**
	 * Check resp.
	 *
	 * @return a report with one "tag=state" line per watched tag; the state is true if a start
	 * tag with that name has been seen
	 */
	public String checkResp()
	{
		StringBuilder rez = new StringBuilder("found tags : \n");
		for (Map.Entry<String,Boolean> tag : checkTags.entrySet()) {
			rez.append("\t").append(tag.getKey()).append("=").append(tag.getValue()).append("\n");
		}
		return rez.toString();
	}

	/**
	 * Sets the correspondances.
	 *
	 * @param correspRef the corresp ref
	 * @param correspType the corresp type
	 */
	public void setCorrespondances(correspRef, correspType)
	{
		this.correspRef = correspRef;
		this.correspType = correspType;
	}

	/**
	 * Sets the header infos.
	 *
	 * @param respId the resp id
	 * @param resps the resps
	 * @param applications the applications
	 * @param taxonomies the taxonomies
	 * @param items the items
	 */
	public void setHeaderInfos(respId,resps, applications, taxonomies, items)
	{
		this.respId = respId
		this.resps = resps
		this.applications = applications
		this.taxonomies = taxonomies;
		this.items = items;
	}

	/**
	 * Write txm resps: one respStmt per resp id.
	 */
	public void writeTXMResps()
	{
		for (String ref : respId) {
			String[] infos = resps.get(ref);

			writer.writeStartElement("respStmt");

			writer.writeStartElement(RESP);
			writer.writeAttribute(ID,ref);
			writer.writeCharacters(infos[0]);
			writer.writeStartElement("date");
			writer.writeAttribute("when",infos[2]);
			writer.writeCharacters(infos[3]);
			writer.writeEndElement(); // date
			writer.writeEndElement(); //resp

			writer.writeStartElement("name");
			writer.writeAttribute(TYPE, "person");
			writer.writeCharacters(infos[1])
			writer.writeEndElement(); // name

			writer.writeEndElement(); //respStmt
		}
	}

	/**
	 * Write txm apps: one txm:application per resp id, containing the non-empty lines of its
	 * report file (if any) and the references to the taxonomies it uses.
	 */
	public void writeTXMApps()
	{
		for (String ref : respId) {
			List list = applications.get(ref);
			String ident = list.get(0);
			String version = list.get(1);
			File report = list.get(2);

			writer.writeStartElement(TXMNS, "application");
			writer.writeAttribute("ident", ident);
			writer.writeAttribute("version", version);
			writer.writeAttribute(RESP, ref);

			//get txm:commandLine from GeneratedReport
			if (report != null) {
				// empty text + flush so that what is written to the output below comes after
				// what the writer has written so far
				writer.writeCharacters("");writer.flush();

				// the report is read with the platform default charset, as before
				BufferedReader reader = new BufferedReader(new FileReader(report));
				try {
					String line = reader.readLine();
					while (line != null) {
						if (line.length() != 0) {
							output.write(line+"\n"); // copied as is, without XML escaping
						}
						line = reader.readLine();
					}
				} finally {
					reader.close(); // also release the file when a read or write fails
				}
			}

			writer.writeStartElement("ab");
			writer.writeAttribute(TYPE, "annotation");
			for (String item : taxonomies.get(ref)) {
				writer.writeStartElement("list");
				writer.writeEmptyElement("ref");
				writer.writeAttribute(TYPE, "tagset");
				writer.writeAttribute("target", item);
				writer.writeEndElement(); // list
			}
			writer.writeEndElement(); // ab

			writer.writeEndElement(); // txm:application
		}
	}

	/**
	 * Write txm taxonomies: one taxonomy element per entry of items.
	 */
	public void writeTXMTaxonomies()
	{
		for (Map.Entry<String,HashMap<String,String>> taxonomy : items.entrySet()) {
			String tax = taxonomy.getKey();

			writer.writeStartElement("taxonomy");
			writer.writeAttribute(ID, tax);

			writer.writeStartElement("bibl");
			writer.writeAttribute(TYPE, "tagset");
			writer.writeStartElement("title");
			writer.writeCharacters(tax);
			writer.writeEndElement(); // title

			for (Map.Entry<String,String> target : taxonomy.getValue().entrySet()) {
				writer.writeEmptyElement("ref");
				writer.writeAttribute(TYPE, target.getKey());
				writer.writeAttribute("target", target.getValue());
			}
			writer.writeEndElement(); // bibl

			writer.writeEndElement(); // taxonomy
		}
	}

	/**
	 * The main method: sample run on a file of the user home directory.
	 *
	 * @param args the arguments (not used)
	 */
	public static void main(String[] args) {

		// "~" is not expanded by java.io.File: resolve the home directory explicitly
		File rootDir = new File(System.getProperty("user.home"), "xml/rgaqcj");
		File outDir = new File(rootDir, "anainline");
		outDir.mkdirs();

		String file = "roland-p5.xml";
		String anafile = "roland-p5.xml";

		def correspType = new HashMap<String,String>()
		// correspType (word attribute, type attribute of the ana property of the txm w)
		correspType.put("p2","CATTEX2009");

		def correspRef = new HashMap<String,String>()
		// correspRef (word attribute, ref attribute of the ana property of the txm w;
		// ref points to the identifier of the respStmt of the TEI header)
		correspRef.put("p2","ctx1");

		// the ids of all the respStmt must be listed
		def respId = ["ctx1"];//,"TT1", "TnT1"];

		// associates the respId with the tool: [app ident, app version, app report file]
		def applications = new HashMap<String,List>();
		applications.put("ctx1", ["Oxygen", "9.3", null]);

		// associates the respId with the type attributes of the ana property of the txm w,
		// to build the references to the taxonomies
		def taxonomiesUtilisees = new HashMap<String,String[]>();
		taxonomiesUtilisees.put("ctx1", ["CATTEX2009"] as String[]);//,"lemma","lasla","grace"]);

		// associates an item id with its description and its URI
		def itemsURI = new HashMap<String,HashMap<String,String>>();
		itemsURI.put("CATTEX2009",new HashMap<String,String>());
		itemsURI.get("CATTEX2009").put("tagset","http://bfm.ens-lsh.fr/IMG/xml/cattex2009.xml");
		itemsURI.get("CATTEX2009").put("website","http://bfm.ens-lsh.fr/article.php3?id_article=176");

		// respStmt informations
		// resps (respId <see above>, [description, person, date, date text])
		def resps = new HashMap<String,String[]>();
		resps.put("ctx1", ["initial tagging","alavrentiev","2010-03-02","Tue Mar  2 21:02:55 Paris, Madrid 2010"] as String[])

		// run the process
		def builder = new Xml2Ana(new File(new File(rootDir, "src"), file));
		builder.setCorrespondances(correspRef, correspType);
		builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
		// output directory + output file name
		// NOTE(review): assumes the inherited process() takes the output File - confirm against StaxIdentityParser
		builder.process(new File(outDir, anafile));

		return
	}
}