Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / teitxm / Xml2Ana.groovy @ 812

History | View | Annotate | Download (15.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-04-19 16:23:38 +0200 (mer. 19 avril 2017) $
25
// $LastChangedRevision: 3430 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.teitxm;
29

    
30
import org.txm.importer.HasElement;
31
import org.txm.importer.StaxIdentityParser;
32

    
33
import java.text.DateFormat;
34
import java.util.ArrayList;
35
import java.util.Date;
36
import java.util.HashMap;
37
import java.util.Locale;
38

    
39
import javax.xml.stream.*;
40

    
41
import java.net.URL;
42

    
43
import org.txm.importer.filters.*;
44
import org.txm.utils.AsciiUtils;
45

    
46
/**
 * The Class Xml2Ana.
 *
 * @author mdecorde
 * Transforms a pre XML-TEI file into an XML-TEI-TXM file.
 * The pre XML-TEI file must contain a minimal teiHeader with classDecl, encodingDesc and titleStmt.
 * 
 * You must specify the correspondence between word attributes and ana types & respStmt IDs;
 * the attributes of the w tags are then transformed into annotation (txm:ana) elements.
 */
56
public class Xml2Ana extends StaxIdentityParser
57
{
58
        /** Not referenced by any method of this class. */
        private def dir;

        /**
         * When true, word attributes that have no declared correspondence are
         * emitted as annotations (resp "none") instead of being copied as attributes.
         */
        private boolean convertAllAttributes = false;

        /** Word attribute name -> annotation type; written as "#" + type on the annotation. */
        HashMap<String,String> correspType;

        /** Word attribute name -> responsibility id; written as "#" + id on the annotation. */
        HashMap<String,String> correspRef;

        /** Element names to watch for; the value becomes true once such a start element is read. */
        HashMap<String,Boolean> checkTags = new HashMap<String,Boolean>();

        /** Responsibility ids, iterated when writing the respStmt and application elements. */
        def respId = [];

        /**
         * Responsibility id -> [ident, version, report file] list, as read by
         * writeTXMApps(). The value type was previously declared as File, which
         * did not match that usage.
         */
        HashMap<String,List> applications;

        /** Responsibility id -> tagset targets referenced from the application element. */
        HashMap<String,String[]> taxonomies;

        /** Responsibility id -> [resp text, person name, "when" attribute, date text]. */
        HashMap<String,String[]> resps;

        /** Taxonomy id -> (ref type -> ref target), written by writeTXMTaxonomies(). */
        HashMap<String,HashMap<String,String>> items;

        /** Not referenced by any method of this class. */
        boolean XMLHeaderadded = false;

        /** Source file name without its last extension; used as id of the text element. */
        String textname;

        /** Local name of the element that holds a word. */
        String wtag = "w";

        public static final String TEXT = "text"
        public static final String ID = "id"
95

    
96
        /**
         * Builds a converter for one source file.
         *
         * The text name is derived from the file name (last extension removed),
         * the watched header tags are registered as "not seen yet", and the file
         * is scanned once to know whether it already contains a text element.
         *
         * @param file the XML file to convert
         */
        public Xml2Ana(File file) {
                super(file.toURI().toURL());

                String filename = file.getName();
                int dot = filename.lastIndexOf(".");
                // a leading dot (index 0) is not treated as an extension separator
                textname = (dot > 0) ? filename.substring(0, dot) : filename;

                for (String watched : ["respStmt", "titleStmt", "appInfo"]) {
                        checkTags.put(watched, false);
                }

                hasText = new HasElement(file, TEXT).process();
        }
117

    
118
        /**
         * Chooses what happens to word attributes that have no declared
         * correspondence: converted to annotations (true) or copied as
         * plain attributes (false).
         *
         * @param value true to convert every remaining attribute into an annotation
         * @return the value that was set
         */
        public setConvertAllAtrtibutes(boolean value) {
                this.convertAllAttributes = value;
        }
127
        
128
        /**
         * Sets the local name of the element that is treated as a word.
         *
         * @param wtag the element local name to match (the default is "w")
         * @return the name that was set
         */
        public setWordTag(String wtag) {
                this.wtag = wtag;
        }
137

    
138
        /** Number of word elements read so far. */
        int idcount = 0;
        /** True between the start and the end of a word element; makes processCharacters() trim the text. */
        boolean flagWord = false;
        /** Number of start elements read so far (1 while the root element is processed). */
        int firstElement = 0;
        /** True when a TEI root was written because the first element of the source is not TEI. */
        boolean teiElementAdded = false;
        /** True when a teiHeader was written because the second element of the source is not teiHeader. */
        boolean teiHeaderElementAdded = false;
        /** Initialised by the constructor from a scan of the file; true once a text element is read or written. */
        boolean hasText = false;
        /** True when this class opened the text element itself, so that after() closes it. */
        boolean textElementAdded = false;
        /** Annotations of the current word as [resp, type, value] triples, written by processEndElement(). */
        def anabalises = [];

        /**
         * Handles a start element.
         *
         * <ul>
         * <li>word element: opens a text element first if none exists, rewrites the
         * id attribute, turns mapped attributes into pending annotations and opens
         * the txm:form element that receives the word form;</li>
         * <li>TEI: copies the element and declares the TEI and TXM namespaces when
         * the source does not;</li>
         * <li>any other element: injects the missing TEI root, teiHeader and text
         * wrapper where needed, then copies the element.</li>
         * </ul>
         */
        protected void processStartElement()
        {
                firstElement++;

                String name = parser.getLocalName();
                if (this.checkTags.containsKey(name)) {
                        this.checkTags.put(name, true);
                }

                switch (name) {
                        case wtag:
                                openTextElementIfMissing();
                                idcount++; // increment word counter
                                anabalises.clear();

                                writer.writeStartElement(name); // write w

                                for (int i = 0 ; i < parser.getNamespaceCount() ; i++) { // write namespaces
                                        writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
                                }

                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) { // transform attributes
                                        String type = parser.getAttributeLocalName(i);
                                        String value = parser.getAttributeValue(i);
                                        // correspType is null until setCorrespondances() is called;
                                        // in that case no attribute is mapped to an annotation
                                        if (correspType != null && correspType.containsKey(type)) {
                                                String corresptype = correspType.get(type);
                                                String ref = correspRef.get(type);
                                                anabalises.add(["#"+ref, "#"+corresptype, value]);
                                        } else if (type == ID) { // keep id attribute
                                                String wordid = value
                                                // normalise a "w" prefix to "w_"
                                                if (wordid.startsWith("w") && !wordid.startsWith("w_")) {
                                                        wordid = "w_"+wordid.substring(1)
                                                }
                                                wordid = AsciiUtils.buildAttributeId(wordid); // remove characters not compatible with the id attribute value
                                                writer.writeAttribute(type, wordid);
                                        } else if (convertAllAttributes) { // attribute that was in the original <w>
                                                anabalises.add(["none","#"+type, value])
                                        } else {
                                                writer.writeAttribute(type, value);
                                        }
                                }

                                flagWord = true; // start to capture the form
                                writer.writeStartElement(TXMNS, "form");
                                break;

                        case "TEI":
                                super.processStartElement();
                                boolean hasTeiNS = false;
                                boolean hasTXMNs = false;
                                for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
                                        if (parser.getNamespaceURI(i) == TXMNS) {
                                                hasTXMNs = true;
                                        } else if (parser.getNamespaceURI(i) == TEINS) {
                                                hasTeiNS = true;
                                        }
                                }
                                if (!hasTeiNS) {
                                        writer.writeDefaultNamespace(TEINS);
                                }
                                if (!hasTXMNs) {
                                        writer.writeNamespace(TXM, TXMNS);
                                }
                                break;

                        default:
                                // NOTE(review): "localname" is not declared in this class; presumably
                                // the parent class sets it to the current element name - confirm.
                                if (TEXT.equals(localname)) {
                                        hasText = true;
                                }

                                if (firstElement == 1) { // test if first element is TEI
                                        if (localname != "TEI") { // "TEI" is missing
                                                teiElementAdded = true;
                                                addTEIElement();
                                        } else {
                                                openTextElementIfMissing();
                                        }
                                }

                                if (firstElement == 2 && !teiElementAdded && localname != "teiHeader") {
                                        // teiHeader is missing
                                        writeTeiHeader();
                                        hasTeiHeader = true
                                        teiHeaderElementAdded = true
                                }

                                // Once a TEI root or a teiHeader has been injected, the text wrapper
                                // must be opened before the current element is copied. This used to be
                                // skipped on the very element that triggered the teiHeader injection,
                                // which left that element outside of the text wrapper.
                                if (teiElementAdded || teiHeaderElementAdded) {
                                        openTextElementIfMissing();
                                }

                                super.processStartElement();
                                if (TEXT.equals(localname) && !parser.getAttributeValue(null, ID)) {
                                        // give an id to a source text element that has none
                                        writer.writeAttribute(ID, textname);
                                }
                }
        }

        /** Opens a text element carrying the text name as id, unless a text element already exists. */
        private void openTextElementIfMissing()
        {
                if (hasText) {
                        return;
                }
                writer.writeStartElement(TEXT);
                writer.writeAttribute(ID, textname);
                textElementAdded = true;
                hasText = true;
        }
262

    
263
        /**
         * Closes the wrapper elements this class opened itself (text first,
         * then TEI), then lets the parent class finish.
         */
        protected void after()
        {
                // innermost wrapper first: text, then TEI
                for (boolean opened : [textElementAdded, teiElementAdded]) {
                        if (opened) {
                                writer.writeEndElement();
                        }
                }
                super.after(); // close writer, parser, etc
        }
273

    
274
        /**
         * Opens a TEI root element with the TEI namespace as default namespace,
         * declares the TXM and TEI prefixes, then writes a generated teiHeader.
         * The TEI element is left open.
         */
        protected void addTEIElement()
        {
                writer.writeStartElement("TEI");
                writer.writeDefaultNamespace(TEINS);
                for (def declaration : [[TXM, TXMNS], [TEI, TEINS]]) {
                        writer.writeNamespace(declaration[0], declaration[1]);
                }
                writeTeiHeader();
        }
282

    
283
        /**
         * Writes character data: trimmed while inside a word element,
         * delegated to the parent class otherwise.
         */
        protected void processCharacters()
        {
                if (!flagWord) {
                        super.processCharacters();
                        return;
                }
                // keep form in 1 line
                writer.writeCharacters(parser.getText().trim());
        }
291
        
292
        /** True once the end of a classDecl element has been read. */
        boolean hasClassDecl = false;
        /** True once the end of a fileDesc element has been read. */
        boolean hasFileDesc = false;
        /** True once the end of an encodingDesc element has been read. */
        boolean hasEncodingDesc = false;
        /** True once a teiHeader has been read, or generated by processStartElement(). */
        boolean hasTeiHeader = false;
        /** True once the end of the TEI element has been read. */
        boolean hasTEI = false;
        public static String ANA = "ana"
        public static String RESP = "resp"
        public static String TYPE = "type"

        /**
         * Handles an end element.
         *
         * Before the element is closed by the parent class, extra content is
         * appended: the pending annotations for a word, the responsibility
         * statements for fileDesc, the taxonomies for classDecl, the application
         * info for encodingDesc, a generated encodingDesc for a teiHeader that has
         * none, and a generated teiHeader for a TEI element that has none.
         */
        protected void processEndElement()
        {
                String closing = parser.getLocalName();

                if (closing == wtag) {
                        writer.writeEndElement(); // txm:form
                        for (def ana in anabalises) {
                                // <txm:ana resp=ref type=corresptype>value</txm:ana>
                                writer.writeStartElement(TXMNS, ANA);
                                writer.writeAttribute(RESP, ana[0]);
                                writer.writeAttribute(TYPE, ana[1]);
                                writer.writeCharacters(ana[2]);
                                writer.writeEndElement(); // txm:ana
                        }
                        flagWord = false;
                } else if (closing == "fileDesc") {
                        hasFileDesc = true;
                        this.writeTXMResps();
                } else if (closing == "classDecl") {
                        hasClassDecl = true;
                        this.writeTXMTaxonomies();
                } else if (closing == "encodingDesc") {
                        hasEncodingDesc = true;
                        writeContentOfEncodingDesc();
                } else if (closing == "teiHeader") {
                        hasTeiHeader = true
                        if (!hasEncodingDesc) {
                                writer.writeStartElement("encodingDesc");
                                writeContentOfEncodingDesc();
                                writer.writeEndElement();
                        }
                } else if (closing == "TEI") {
                        hasTEI = true;
                        if (!hasTeiHeader) {
                                writeTeiHeader();
                        }
                }

                super.processEndElement();
        }
350

    
351
        /**
         * Writes a complete generated teiHeader: a fileDesc holding the
         * responsibility statements, an empty title, empty publicationStmt and
         * sourceDesc, followed by an encodingDesc with the generated content.
         */
        protected void writeTeiHeader()
        {
                writer.writeStartElement("teiHeader");

                writer.writeStartElement("fileDesc");
                this.writeTXMResps();
                writer.writeStartElement("titleStmt");
                writer.writeStartElement("title");
                writer.writeEndElement(); // title
                writer.writeEndElement(); // titleStmt
                // the two remaining children of fileDesc are written empty
                for (String placeholder : ["publicationStmt", "sourceDesc"]) {
                        writer.writeStartElement(placeholder);
                        writer.writeEndElement();
                }
                writer.writeEndElement(); // fileDesc

                writer.writeStartElement("encodingDesc");
                writeContentOfEncodingDesc();
                writer.writeEndElement(); // encodingDesc

                writer.writeEndElement(); // teiHeader
        }
370

    
371
        /**
         * Writes the generated children of encodingDesc: an appInfo element with
         * the applications and, if no classDecl was read so far, a classDecl
         * element with the taxonomies.
         */
        protected void writeContentOfEncodingDesc()
        {
                writer.writeStartElement("appInfo");
                this.writeTXMApps();
                writer.writeEndElement(); // appInfo

                if (hasClassDecl) {
                        return; // the taxonomies were already added to the source classDecl
                }
                writer.writeStartElement("classDecl");
                this.writeTXMTaxonomies();
                writer.writeEndElement(); // classDecl
        }
382

    
383
        /**
         * Reports which of the watched tags were found while parsing.
         *
         * Only the tags whose flag was switched to true by processStartElement()
         * are listed; previously every watched tag was listed whether or not it
         * had been seen.
         *
         * @return "found tags : " followed by one tab-indented line per tag found
         */
        public String checkResp()
        {
                StringBuilder rez = new StringBuilder("found tags : \n");
                for (Map.Entry<String,Boolean> tag : checkTags.entrySet()) {
                        if (tag.getValue()) {
                                rez.append("\t").append(tag.getKey()).append("\n");
                        }
                }
                return rez.toString();
        }
395

    
396
        /**
         * Declares which word attributes become annotations.
         *
         * @param correspRef word attribute name -> responsibility id of the annotation
         * @param correspType word attribute name -> type of the annotation
         */
        public void setCorrespondances(correspRef, correspType)
        {
                this.correspType = correspType;
                this.correspRef = correspRef;
        }
407

    
408
        /**
         * Provides the data used to generate the header content.
         *
         * @param respId the responsibility ids to write
         * @param resps responsibility id -> [resp text, person name, "when" attribute, date text]
         * @param applications responsibility id -> [ident, version, report file]
         * @param taxonomies responsibility id -> tagset targets of the application
         * @param items taxonomy id -> (ref type -> ref target)
         */
        public void setHeaderInfos(respId,resps, applications, taxonomies, items)
        {
                this.respId = respId;
                this.resps = resps;
                this.applications = applications;
                this.taxonomies = taxonomies;
                this.items = items;
        }
425

    
426
        /**
         * Writes one respStmt element per responsibility id: a resp element
         * (id, description text and a nested date) followed by the name of
         * the person.
         */
        public void writeTXMResps()
        {
                for (String ref in respId) {
                        // [0] resp text, [1] person name, [2] "when" attribute, [3] date text
                        String[] infos = resps.get(ref);

                        writer.writeStartElement("respStmt");

                        writer.writeStartElement(RESP);
                        writer.writeAttribute(ID, ref);
                        writer.writeCharacters(infos[0]);
                        writer.writeStartElement("date");
                        writer.writeAttribute("when", infos[2]);
                        writer.writeCharacters(infos[3]);
                        writer.writeEndElement(); // date
                        writer.writeEndElement(); // resp

                        writer.writeStartElement("name");
                        writer.writeAttribute(TYPE, "person");
                        writer.writeCharacters(infos[1]);
                        writer.writeEndElement(); // name

                        writer.writeEndElement(); // respStmt
                }
        }
449

    
450
        /**
         * Writes one txm:application element per responsibility id.
         *
         * Each element carries the ident, version and resp attributes, the
         * non-empty lines of the report file (when one is given) copied as they
         * are, and an "ab" element listing the tagsets used by the application.
         */
        public void writeTXMApps()
        {
                for (String ref : respId) {
                        // [0] ident, [1] version, [2] report file (may be null)
                        List appInfos = applications.get(ref);
                        String ident = appInfos.get(0);
                        String version = appInfos.get(1);
                        File report = appInfos.get(2);

                        writer.writeStartElement(TXMNS, "application");
                        writer.writeAttribute("ident", ident);
                        writer.writeAttribute("version", version);
                        writer.writeAttribute(RESP, ref);

                        //get txm:commandLine from GeneratedReport
                        if (report != null) {
                                // presumably forces the pending start tag to be completed and
                                // flushed before raw text is written to "output" - confirm
                                writer.writeCharacters("");writer.flush();
                                Reader reader = new FileReader(report);
                                try {
                                        String line = reader.readLine();
                                        while (line != null) {
                                                if (line.length() != 0) {
                                                        output.write(line+"\n");
                                                }
                                                line = reader.readLine();
                                        }
                                } finally {
                                        // close the report even when reading or writing fails
                                        reader.close();
                                }
                        }

                        writer.writeStartElement("ab");
                        writer.writeAttribute(TYPE, "annotation");
                        for (String item : taxonomies.get(ref)) {
                                writer.writeStartElement("list");
                                writer.writeEmptyElement("ref");
                                writer.writeAttribute(TYPE, "tagset");
                                writer.writeAttribute("target", item);
                                writer.writeEndElement(); // list
                        }
                        writer.writeEndElement(); // ab
                        writer.writeEndElement(); // txm:application
                }
        }
492

    
493
        /**
         * Writes one taxonomy element per entry of the items map: the taxonomy
         * id, then a bibl element holding the title and one empty ref element
         * (type and target) per declared reference.
         */
        public void writeTXMTaxonomies()
        {
                for (String tax in items.keySet()) {
                        writer.writeStartElement("taxonomy");
                        writer.writeAttribute(ID, tax);

                        writer.writeStartElement("bibl");
                        writer.writeAttribute(TYPE, "tagset");

                        writer.writeStartElement("title");
                        writer.writeCharacters(tax);
                        writer.writeEndElement(); // title

                        for (Map.Entry<String,String> reference : items.get(tax).entrySet()) {
                                writer.writeEmptyElement("ref");
                                writer.writeAttribute(TYPE, reference.getKey());
                                writer.writeAttribute("target", reference.getValue());
                        }

                        writer.writeEndElement(); // bibl
                        writer.writeEndElement(); // taxonomy
                }
        }
517

    
518
        /**
         * Sample run: converts "roland-p5.xml" from the "xml/rgaqcj/src"
         * directory of the user's home, with one annotation ("p2" mapped to
         * CATTEX2009) and one responsibility ("ctx1").
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args) {

                // java.io.File does not expand "~", so the home directory is resolved explicitly
                String rootDir = System.getProperty("user.home") + "/xml/rgaqcj/";
                new File(rootDir+"anainline/").mkdirs();

                String file = "roland-p5.xml";
                String anafile = "roland-p5.xml";

                // correspType (word attribute, "type" attribute of the ana property of the txm word)
                def correspType = new HashMap<String,String>()
                correspType.put("p2","CATTEX2009");

                // correspRef (word attribute, "ref" attribute of the ana property of the txm word;
                // the ref points to the identifier of a respStmt of the teiHeader)
                def correspRef = new HashMap<String,String>()
                correspRef.put("p2","ctx1");

                // the ids of all the respStmt must be listed
                def respId = ["ctx1"];//,"TT1", "TnT1"];

                // maps each respId to [app ident, app version, execution report file of the tool]
                def applications = new HashMap<String,List>();
                applications.put("ctx1",new ArrayList());
                applications.get("ctx1").add("Oxygen");//app ident
                applications.get("ctx1").add("9.3");//app version
                applications.get("ctx1").add(null);//app report file path

                // maps each respId to the "type" attributes of the ana property of the txm word,
                // to build the references to the taxonomies
                def taxonomiesUtilisees = new HashMap<String,String[]>();
                taxonomiesUtilisees.put("ctx1",["CATTEX2009"] as String[]);//,"lemma","lasla","grace"]);

                // associates an item id with its description and its URI
                def itemsURI = new HashMap<String,HashMap<String,String>>();
                itemsURI.put("CATTEX2009",new HashMap<String,String>());
                itemsURI.get("CATTEX2009").put("tagset","http://bfm.ens-lsh.fr/IMG/xml/cattex2009.xml");
                itemsURI.get("CATTEX2009").put("website","http://bfm.ens-lsh.fr/article.php3?id_article=176");

                // respStmt information
                // resps (respId <see above>, [description, person, date, date text])
                def resps = new HashMap<String,String[]>();
                resps.put("ctx1", ["initial tagging","alavrentiev","2010-03-02","Tue Mar  2 21:02:55 Paris, Madrid 2010"] as String[])

                // run the conversion
                def builder = new Xml2Ana(new File(rootDir+"/src/",file));
                builder.setCorrespondances(correspRef, correspType);
                builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
                // output directory + output file name
                // NOTE(review): only the bare file name is passed although the "anainline"
                // directory is created above - confirm what process() expects.
                builder.process(anafile);
        }
576

    
577
}