Statistics
| Revision:

root / tmp / org.txm.core / src / java / org / txm / importer / scripts / xmltxm / Xml2Ana.groovy @ 1000

History | View | Annotate | Download (15.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-04-19 16:23:38 +0200 (mer. 19 avril 2017) $
25
// $LastChangedRevision: 3430 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.importer.scripts.xmltxm
29

    
30
import org.txm.importer.StaxIdentityParser;
31

    
32
import java.text.DateFormat;
33
import java.util.ArrayList;
34
import java.util.Date;
35
import java.util.HashMap;
36
import java.util.Locale;
37

    
38
import javax.xml.stream.*;
39

    
40
import java.net.URL;
41

    
42
import org.txm.importer.filters.*;
43
import org.txm.scripts.importer.HasElement
44
import org.txm.scripts.importer.StaxStackWriter
45
import org.txm.utils.AsciiUtils;
46

    
47
/**
48
 * The Class Xml2Ana.
49
 *
50
 * @author mdecorde
51
 * transform : pre xml-tei file >> xml-tei-txm file
52
 * The pre xml-tei file must contain a minimal teiHeader with classDecl, encodingDesc and titleStmt
53
 * 
54
 * you must specify the correspondence between word attributes and ana types & respStmt IDs
55
 * then the mapped attributes of the w tags will be transformed into txm:ana elements
56
 */
57
class Xml2Ana extends StaxIdentityParser
58
{
59
        /** Working directory; not referenced by any method of this class. */
        private def dir;

        /**
         * When true, word attributes that are neither mapped by correspType nor
         * the id attribute are turned into annotations instead of being copied.
         */
        private boolean convertAllAttributes = false;

        /** Word attribute name -> value written in the "type" attribute of its annotation. */
        HashMap<String,String> correspType;

        /** Word attribute name -> value written in the "resp" attribute of its annotation. */
        HashMap<String,String> correspRef;

        /** Local names to watch; the value becomes true once a start tag with that name is read. */
        HashMap<String,Boolean> checkTags = new HashMap<String,Boolean>();

        /** Identifiers iterated when writing the respStmt and application elements. */
        def respId = [];

        /**
         * Identifier -> application description.
         * NOTE(review): declared with File values, but writeTXMApps() reads each
         * value as a list of (ident, version, report file) - confirm the intended type.
         */
        HashMap<String,File> applications;

        /** Identifier -> targets written as "ref" elements of the matching application. */
        HashMap<String,String[]> taxonomies;

        /** Identifier -> four values read by index in writeTXMResps(). */
        HashMap<String,String[]> resps;

        /** Taxonomy id -> (ref type -> ref target), written by writeTXMTaxonomies(). */
        HashMap<String,HashMap<String,String>> items;

        /** Not read or written by any method of this class. */
        boolean XMLHeaderadded = false;

        /** Source file name without its last extension; used as id of "text" elements. */
        String textname;

        /** Local name of the elements processed as words. */
        String wtag = "w";

        /** Local name of the text element. */
        public static final String TEXT = "text"

        /** Name of the identifier attribute. */
        public static final String ID = "id"
96

    
97
        /**
         * Creates a converter reading the given file.
         *
         * The text name is the file name without its last extension, the watched
         * header tags are registered as "not seen", and hasText is initialised with
         * the result of HasElement(file, TEXT).process().
         *
         * @param file the XML source file
         */
        public Xml2Ana(File file) {
                super(file.toURI().toURL());

                String filename = file.getName();
                int dot = filename.lastIndexOf(".");
                // a name without dot, or starting with its only dot, is kept whole
                textname = (dot > 0) ? filename.substring(0, dot) : filename;

                for (String watched : ["respStmt", "titleStmt", "appInfo"]) {
                        checkTags.put(watched, false);
                }

                hasText = new HasElement(file, TEXT).process();
        }
118

    
119
        /**
         * Chooses what happens to word attributes that are neither mapped nor the
         * id attribute: converted to annotations (true) or copied as they are (false).
         * The misspelled method name is kept for existing callers.
         *
         * @param value true to convert every remaining attribute into an annotation
         * @return the assigned value (the method has no declared return type)
         */
        public setConvertAllAtrtibutes(boolean value) {
                this.convertAllAttributes = value;
        }
128
        
129
        /**
         * Sets the local name of the elements that are processed as words.
         *
         * @param wtag the element local name, "w" by default
         * @return the assigned name (the method has no declared return type)
         */
        public setWordTag(String wtag) {
                this.wtag = wtag;
        }
138

    
139
        /** Number of word start tags processed so far. */
        int idcount = 0;
        /** True between the start and the end tag of a word; text is then trimmed. */
        boolean flagWord = false;
        /** Number of start tags processed so far (1 while handling the first one). */
        int firstElement = 0;
        /** True when a TEI element was generated because the first element was not "TEI". */
        boolean teiElementAdded = false;
        /** True when a teiHeader was generated because the second element was not "teiHeader". */
        boolean teiHeaderElementAdded = false;
        /** True once a "text" element is known to exist or has been generated. */
        boolean hasText = false;
        /** True when a "text" element was generated here and must be closed by after(). */
        boolean textElementAdded = false;
        /** Pending [resp, type, value] triples of the current word, written at its end tag. */
        def anabalises = [];
147
        /**
         * Handles the start tag the parser is currently positioned on.
         *
         * - word element (local name equal to wtag): opens a generated "text" element
         *   when none is known yet, writes the word start tag with its namespaces,
         *   converts or copies its attributes, then opens a "form" element in the
         *   TXMNS namespace;
         * - "TEI": delegates to the parent implementation, then declares the TEINS
         *   default namespace and the TXM prefix when the source did not declare them;
         * - any other element: generates the missing TEI, teiHeader and text wrappers,
         *   delegates to the parent implementation and gives "text" elements without
         *   id attribute the text name as id.
         */
        protected void processStartElement()
        {
                firstElement++;

                String name = parser.getLocalName();
                if (checkTags.containsKey(name)) {
                        checkTags.put(name, true); // this watched tag has been seen
                }

                switch (name) {
                        case wtag:
                                if (!hasText) {
                                        openImpliedTextElement();
                                }
                                idcount++; // increment word counter
                                anabalises.clear();

                                writer.writeStartElement(name); // write w

                                for (int i = 0 ; i < parser.getNamespaceCount() ; i++) { // write namespaces
                                        writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
                                }

                                convertWordAttributes();

                                flagWord = true; // start to capture the form
                                writer.writeStartElement(TXMNS, "form");
                                break;

                        case "TEI":
                                super.processStartElement();
                                boolean hasTeiNS = false;
                                boolean hasTXMNs = false;
                                for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
                                        if (parser.getNamespaceURI(i) == TXMNS) {
                                                hasTXMNs = true;
                                        } else if (parser.getNamespaceURI(i) == TEINS) {
                                                hasTeiNS = true;
                                        }
                                }
                                if (!hasTeiNS) {
                                        writer.writeDefaultNamespace(TEINS);
                                }
                                if (!hasTXMNs) {
                                        writer.writeNamespace(TXM, TXMNS);
                                }
                                break;

                        default:
                                if (TEXT.equals(localname)) {
                                        hasText = true;
                                }

                                if (firstElement == 1) { // test if first element is TEI
                                        if (localname != "TEI") { // "TEI" is missing
                                                teiElementAdded = true;
                                                addTEIElement();
                                        } else if (!hasText) {
                                                openImpliedTextElement();
                                        }
                                }
                                if (firstElement == 2 && !teiElementAdded) {
                                        if (localname != "teiHeader") { // teiHeader is missing
                                                writeTeiHeader();
                                                hasTeiHeader = true
                                                teiHeaderElementAdded = true
                                        }
                                } else if (!hasText && (teiElementAdded || teiHeaderElementAdded)) {
                                        // the header was generated: the content needs a text wrapper
                                        openImpliedTextElement();
                                }

                                super.processStartElement();
                                if (TEXT.equals(localname)) {
                                        if (!parser.getAttributeValue(null, ID)) { // no id, or empty id
                                                writer.writeAttribute(ID, textname);
                                        }
                                }
                }
        }

        /** Opens a generated "text" element identified by the text name; after() closes it. */
        private void openImpliedTextElement()
        {
                writer.writeStartElement(TEXT);
                writer.writeAttribute(ID, textname);
                textElementAdded = true;
                hasText = true;
        }

        /**
         * Processes the attributes of the current word: mapped attributes are queued
         * as annotations, the id attribute is normalised and written, the others are
         * queued or copied depending on convertAllAttributes.
         */
        private void convertWordAttributes()
        {
                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                        String type = parser.getAttributeLocalName(i);
                        String value = parser.getAttributeValue(i);
                        // correspType stays null until setCorrespondances() is called:
                        // in that case no attribute is mapped
                        if (correspType != null && correspType.containsKey(type)) { // check if txm:ana
                                String corresptype = correspType.get(type);
                                String ref = correspRef.get(type);
                                anabalises.add(["#"+ref, "#"+corresptype, value]);
                        } else if (type == ID) { // keep id attribute
                                String wordid = value
                                // "w12" becomes "w_12"; ids already starting with "w_" are kept
                                if (wordid.startsWith("w") && !wordid.startsWith("w_")) {
                                        wordid = "w_"+wordid.substring(1)
                                }
                                wordid = AsciiUtils.buildAttributeId(wordid); // remove characters not compatible with the id attribute value
                                writer.writeAttribute(type, wordid);
                        } else if (convertAllAttributes) { // attributes that were in the original <w>
                                anabalises.add(["none","#"+type, value])
                        } else {
                                writer.writeAttribute(type, value);
                        }
                }
        }
263

    
264
        /**
         * Closes the elements generated by this class, innermost first, before
         * handing over to the parent implementation.
         */
        protected void after()
        {
                if (textElementAdded) { // generated text
                        writer.writeEndElement();
                }

                if (teiElementAdded) { // generated TEI
                        writer.writeEndElement();
                }

                // close writer, parser, etc
                super.after();
        }
274

    
275
        /**
         * Opens a generated "TEI" root element and writes a generated teiHeader in it.
         * TEINS is declared as default namespace and bound to the TEI prefix, TXMNS
         * is bound to the TXM prefix. The element is left open: after() closes it
         * when teiElementAdded is set.
         */
        protected void addTEIElement()
        {
                writer.writeStartElement("TEI");

                // namespace declarations of the generated root
                writer.writeDefaultNamespace(TEINS);
                writer.writeNamespace(TXM, TXMNS);
                writer.writeNamespace(TEI, TEINS);

                writeTeiHeader();
        }
283

    
284
        /**
         * Writes the current text node: trimmed when it belongs to a word,
         * through the parent implementation otherwise.
         */
        protected void processCharacters()
        {
                if (!flagWord) {
                        super.processCharacters();
                        return;
                }

                // inside a word: keep form in 1 line
                String form = parser.getText().trim();
                writer.writeCharacters(form);
        }
292
        
293
        /** True once the end tag of a "classDecl" element has been processed. */
        boolean hasClassDecl = false;
        /** True once the end tag of a "fileDesc" element has been processed. */
        boolean hasFileDesc = false;
        /** True once the end tag of an "encodingDesc" element has been processed. */
        boolean hasEncodingDesc = false;
        /** True once a teiHeader has been read or generated as second element. */
        boolean hasTeiHeader = false;
        /** True once the end tag of a "TEI" element has been processed. */
        boolean hasTEI = false;
        /** Local name of the annotation elements written in the TXMNS namespace. */
        public static String ANA = "ana"
        /** "resp": attribute name on annotations and applications, element name in respStmt. */
        public static String RESP = "resp"
        /** Name of the "type" attribute. */
        public static String TYPE = "type"
301
        /**
         * Handles the end tag the parser is currently positioned on, adding the
         * generated content that must appear just before it, then delegates to the
         * parent implementation.
         *
         * - word element: closes the form element and writes the queued annotations;
         * - "fileDesc": appends the respStmt elements;
         * - "classDecl": appends the taxonomies;
         * - "encodingDesc": appends the application infos (and taxonomies if needed);
         * - "teiHeader": appends a generated encodingDesc when none was read;
         * - "TEI": appends a generated teiHeader when none was read.
         */
        protected void processEndElement()
        {
                String name = parser.getLocalName();
                switch (name) {
                        case wtag:
                                writer.writeEndElement(); // txm:form
                                writePendingAnnotations();
                                flagWord = false;
                                break;

                        case "fileDesc":
                                hasFileDesc = true;
                                writeTXMResps();
                                break;

                        case "classDecl":
                                hasClassDecl = true;
                                writeTXMTaxonomies();
                                break;

                        case "encodingDesc":
                                hasEncodingDesc = true;
                                writeContentOfEncodingDesc();
                                break;

                        case "teiHeader":
                                hasTeiHeader = true
                                if (!hasEncodingDesc) {
                                        writer.writeStartElement("encodingDesc");
                                        writeContentOfEncodingDesc();
                                        writer.writeEndElement();
                                }
                                break;

                        case "TEI":
                                hasTEI = true;
                                if (!hasTeiHeader) {
                                        writeTeiHeader();
                                }
                                break;
                }

                super.processEndElement();
        }

        /** Writes one &lt;txm:ana resp=... type=...&gt;value&lt;/txm:ana&gt; per queued triple. */
        private void writePendingAnnotations()
        {
                for (def triple in anabalises) {
                        writer.writeStartElement(TXMNS, ANA);
                        writer.writeAttribute(RESP, triple[0]);
                        writer.writeAttribute(TYPE, triple[1]);
                        writer.writeCharacters(triple[2]);
                        writer.writeEndElement(); // txm:ana
                }
        }
351

    
352
        /**
         * Writes a complete generated teiHeader: a fileDesc holding the respStmt
         * elements followed by empty-content titleStmt/title, publicationStmt and
         * sourceDesc elements, then an encodingDesc with the generated content.
         */
        protected void writeTeiHeader()
        {
                writer.writeStartElement("teiHeader");

                // fileDesc
                writer.writeStartElement("fileDesc")
                writeTXMResps();
                writer.writeStartElement("titleStmt")
                writer.writeStartElement("title")
                writer.writeEndElement(); // title
                writer.writeEndElement(); // titleStmt
                for (String placeholder : ["publicationStmt", "sourceDesc"]) {
                        writer.writeStartElement(placeholder)
                        writer.writeEndElement();
                }
                writer.writeEndElement(); // fileDesc

                // encodingDesc
                writer.writeStartElement("encodingDesc");
                writeContentOfEncodingDesc();
                writer.writeEndElement(); // encodingDesc

                writer.writeEndElement(); // teiHeader
        }
371

    
372
        /**
         * Writes the generated children of an encodingDesc: an appInfo element with
         * the applications and, unless a classDecl end tag was already processed,
         * a classDecl element with the taxonomies.
         */
        protected void writeContentOfEncodingDesc()
        {
                writer.writeStartElement("appInfo")
                writeTXMApps();
                writer.writeEndElement(); // appInfo

                if (hasClassDecl) {
                        return; // the source already provided a classDecl
                }

                writer.writeStartElement("classDecl");
                writeTXMTaxonomies();
                writer.writeEndElement(); // classDecl
        }
383

    
384
        /**
         * Builds a report of the watched tags that were seen while parsing.
         *
         * Only the entries of checkTags whose flag has been set to true by
         * processStartElement() are listed, one per line, prefixed by a tab.
         *
         * @return the report, starting with the line "found tags : "
         */
        public String checkResp()
        {
                StringBuilder rez = new StringBuilder("found tags : \n");
                for (Map.Entry<String,Boolean> watched : checkTags.entrySet()) {
                        if (watched.getValue()) { // skip the tags that never appeared
                                rez.append("\t").append(watched.getKey()).append("\n");
                        }
                }
                return rez.toString();
        }
396

    
397
        /**
         * Registers how word attributes are converted into annotations.
         * Both maps are keyed by word attribute name.
         *
         * @param correspRef attribute name -> value of the annotation "resp" attribute (written after a "#")
         * @param correspType attribute name -> value of the annotation "type" attribute (written after a "#")
         */
        public void setCorrespondances(correspRef, correspType)
        {
                this.correspType = correspType;
                this.correspRef = correspRef;
        }
408

    
409
        /**
         * Registers the data used to generate the header content.
         *
         * @param respId the identifiers to iterate when writing respStmt and application elements
         * @param resps identifier -> four values read by index in writeTXMResps()
         * @param applications identifier -> (ident, version, report file) read in writeTXMApps()
         * @param taxonomies identifier -> targets of the "ref" elements of its application
         * @param items taxonomy id -> (ref type -> ref target)
         */
        public void setHeaderInfos(respId,resps, applications, taxonomies, items)
        {
                // responsibility data
                this.respId = respId;
                this.resps = resps;

                // application and taxonomy data
                this.applications = applications;
                this.taxonomies = taxonomies;
                this.items = items;
        }
426

    
427
        /**
         * Writes one respStmt element per identifier of respId, shaped as:
         *
         * &lt;respStmt&gt;
         *   &lt;resp id="ref"&gt;infos[0]&lt;date when="infos[2]"&gt;infos[3]&lt;/date&gt;&lt;/resp&gt;
         *   &lt;name type="person"&gt;infos[1]&lt;/name&gt;
         * &lt;/respStmt&gt;
         *
         * where infos is the entry of resps registered for the identifier.
         */
        public void writeTXMResps()
        {
                for (String ref in respId) {
                        String[] fields = resps.get(ref);

                        writer.writeStartElement("respStmt");

                        // resp, with its nested date
                        writer.writeStartElement(RESP);
                        writer.writeAttribute(ID, ref);
                        writer.writeCharacters(fields[0]);
                        writer.writeStartElement("date");
                        writer.writeAttribute("when", fields[2]);
                        writer.writeCharacters(fields[3]);
                        writer.writeEndElement(); // date
                        writer.writeEndElement(); // resp

                        // name
                        writer.writeStartElement("name");
                        writer.writeAttribute(TYPE, "person");
                        writer.writeCharacters(fields[1]);
                        writer.writeEndElement(); // name

                        writer.writeEndElement(); // respStmt
                }
        }
450

    
451
        /**
         * Writes one "application" element (TXMNS namespace) per identifier of respId.
         *
         * Each element carries the ident, version and resp attributes, the non-empty
         * lines of the report file when one is registered, and an "ab" element
         * listing the taxonomies registered for the identifier.
         */
        public void writeTXMApps()
        {
                for (String ref : respId) {
                        // registered as (ident, version, report file)
                        List<String> list= applications.get(ref);
                        String ident = list.get(0);
                        String version = list.get(1);
                        File report = list.get(2);

                        writer.writeStartElement(TXMNS, "application");
                        writer.writeAttribute("ident", ident);
                        writer.writeAttribute("version", version);
                        writer.writeAttribute(RESP, ref);

                        //get txm:commandLine from GeneratedReport
                        if (report != null) {
                                // presumably forces the pending start tag to be completed and
                                // flushed before raw text is written to the output directly
                                writer.writeCharacters("");writer.flush();
                                // NOTE(review): FileReader decodes with the platform default
                                // charset - confirm it matches the encoding of the report
                                BufferedReader reader = new BufferedReader(new FileReader(report));
                                try {
                                        String line = reader.readLine();
                                        while (line != null) {
                                                if (line.length() != 0) {
                                                        // copied as is, without XML escaping
                                                        output.write(line+"\n");
                                                }
                                                line = reader.readLine();
                                        }
                                } finally {
                                        // release the file even when reading or writing fails
                                        reader.close();
                                }
                        }

                        writer.writeStartElement("ab");
                        writer.writeAttribute(TYPE, "annotation");
                        for (String item : taxonomies.get(ref)) {
                                writer.writeStartElement("list");
                                writer.writeEmptyElement("ref");
                                writer.writeAttribute(TYPE, "tagset");
                                writer.writeAttribute("target", item);
                                writer.writeEndElement(); // list
                        }
                        writer.writeEndElement(); // ab
                        writer.writeEndElement(); // txm:application
                }
        }
493

    
494
        /**
         * Writes one "taxonomy" element per entry of items, shaped as:
         *
         * &lt;taxonomy id="tax"&gt;
         *   &lt;bibl type="tagset"&gt;
         *     &lt;title&gt;tax&lt;/title&gt;
         *     &lt;ref type="..." target="..."/&gt;   (one per entry of the taxonomy)
         *   &lt;/bibl&gt;
         * &lt;/taxonomy&gt;
         */
        public void writeTXMTaxonomies()
        {
                for (Map.Entry<String,HashMap<String,String>> taxonomy : items.entrySet()) {
                        String tax = taxonomy.getKey();

                        writer.writeStartElement("taxonomy");
                        writer.writeAttribute(ID, tax);

                        writer.writeStartElement("bibl");
                        writer.writeAttribute(TYPE, "tagset");

                        writer.writeStartElement("title");
                        writer.writeCharacters(tax);
                        writer.writeEndElement(); // title

                        for (Map.Entry<String,String> ref : taxonomy.getValue().entrySet()) {
                                String type = ref.getKey();
                                writer.writeEmptyElement("ref");
                                writer.writeAttribute(TYPE, type);
                                writer.writeAttribute("target", ref.getValue());
                        }

                        writer.writeEndElement(); // bibl
                        writer.writeEndElement(); // taxonomy
                }
        }
518

    
519
        /**
         * Sample run: converts the file "roland-p5.xml" found in the "src" directory
         * of the "xml/rgaqcj" directory of the user home.
         *
         * @param args ignored
         */
        public static void main(String[] args) {

                // "~" is a shell shorthand that the JVM does not expand:
                // resolve the home directory explicitly
                String rootDir = System.getProperty("user.home")+"/xml/rgaqcj/";
                // mkdirs() also creates the missing parent directories
                new File(rootDir+"anainline/").mkdirs();

                String file = "roland-p5.xml";
                String anafile = "roland-p5.xml";

                def correspType = new HashMap<String,String>()
                // correspType (word attribute, "type" attribute of the ana property of the txm w)
                correspType.put("p2","CATTEX2009");

                def correspRef = new HashMap<String,String>()
                // correspRef (word attribute, "ref" attribute of the ana property of the txm w;
                // the ref points to the identifier of a respStmt of the TEI header)
                correspRef.put("p2","ctx1");

                // the ids of all the respStmt must be listed
                def respId = ["ctx1"];//,"TT1", "TnT1"];

                // maps each respId to the description of the tool and its execution report
                def applications = new HashMap<String,List<String>>();
                applications.put("ctx1",new ArrayList<String>());
                applications.get("ctx1").add("Oxygen");//app ident
                applications.get("ctx1").add("9.3");//app version
                applications.get("ctx1").add(null);//app report file path

                // maps each respId to the "type" attributes of the ana properties of the txm w,
                // used to build the refs to the taxonomies
                def taxonomiesUtilisees = new HashMap<String,String[]>();
                taxonomiesUtilisees.put("ctx1",["CATTEX2009"]);//,"lemma","lasla","grace"]);

                // associates an item id with its description and its URI
                def itemsURI = new HashMap<String,HashMap<String,String>>();
                itemsURI.put("CATTEX2009",new HashMap<String,String>());
                itemsURI.get("CATTEX2009").put("tagset","http://bfm.ens-lsh.fr/IMG/xml/cattex2009.xml");
                itemsURI.get("CATTEX2009").put("website","http://bfm.ens-lsh.fr/article.php3?id_article=176");

                // respStmt informations
                // resps (respId <see above>, [description, person, date, date text])
                def resps = new HashMap<String,String[]>();
                resps.put("ctx1", ["initial tagging","alavrentiev","2010-03-02","Tue Mar  2 21:02:55 Paris, Madrid 2010"])

                // run the conversion
                def builder = new Xml2Ana(new File(rootDir+"/src/",file));
                builder.setCorrespondances(correspRef, correspType);
                builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
                // output directory + output file name
                // NOTE(review): only the bare file name is passed although the "anainline"
                // directory is created above - confirm what the parent process() expects
                builder.process(anafile);

                return
        }
577

    
578
}