Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / teitxm / Xml2Ana.groovy @ 724

History | View | Annotate | Download (15.5 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
// 
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
// 
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
// 
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
//
22
//
23
//
24
// $LastChangedDate: 2017-04-19 16:23:38 +0200 (mer. 19 avril 2017) $
25
// $LastChangedRevision: 3430 $
26
// $LastChangedBy: mdecorde $
27
//
28
package org.txm.scripts.teitxm;
29

    
30
import org.txm.importer.HasElement;
31
import org.txm.importer.StaxIdentityParser;
32

    
33
import java.text.DateFormat;
34
import java.util.ArrayList;
35
import java.util.Date;
36
import java.util.HashMap;
37
import java.util.Locale;
38

    
39
import javax.xml.stream.*;
40

    
41
import java.net.URL;
42

    
43
import org.txm.importer.filters.*;
44
import org.txm.utils.AsciiUtils;
45

    
46
/**
 * The Class Xml2Ana.
 *
 * @author mdecorde
 * Transforms a pre xml-tei file into an xml-tei-txm file.
 * The pre xml-tei file must contain a minimal teiHeader with classDecl, encodingDesc and titleStmt.
 *
 * You must specify the correspondence between the word attributes and the ana types &amp; respStmt IDs;
 * the attributes of the w tags are then transformed into txm:ana elements.
 */
56
public class Xml2Ana extends StaxIdentityParser
57
{
58
        /** NOTE(review): never read or written in the visible code — confirm it is still needed. */
        private def dir;

        /**
         * When true, word attributes that are neither mapped in correspType nor the id
         * are turned into annotations (with resp "none") instead of being copied as attributes.
         */
        private boolean convertAllAttributes = false;

        /** Word attribute name -> annotation type; written as "#" + type on the generated txm:ana. */
        HashMap<String,String> correspType;

        /** Word attribute name -> responsibility id; written as "#" + id on the generated txm:ana. */
        HashMap<String,String> correspRef;

        /** Watched element local names -> true once a start tag with that name has been read. */
        HashMap<String,Boolean> checkTags;

        /** Responsibility ids; writeTXMResps() and writeTXMApps() emit one entry per id. */
        def respId = [];

        /**
         * Responsibility id -> application description.
         * NOTE(review): declared with File values, but writeTXMApps() reads each value as a
         * list of [ident, version, report file] — confirm and fix the declared type.
         */
        HashMap<String,File> applications;

        /** Responsibility id -> taxonomy targets referenced from the application description. */
        HashMap<String,String[]> taxonomies;

        /**
         * Responsibility id -> infos, read by writeTXMResps() as
         * [0] resp text, [1] person name, [2] value of the date "when" attribute, [3] date text.
         */
        HashMap<String,String[]> resps;

        /** Taxonomy id -> (ref type -> ref target), written by writeTXMTaxonomies(). */
        HashMap<String,HashMap<String,String>> items;

        /** NOTE(review): not used in the visible code. */
        boolean XMLHeaderadded = false;
        /** Source file name without its last extension; used as the id of the text element. */
        String textname;
        /** Local name of the element treated as a word. */
        String wtag = "w";
        
        /** Local name of the text element. */
        public static final String TEXT = "text"
        /** Name of the id attribute. */
        public static final String ID = "id"
95

    
96
        /**
         * Instantiates a new Xml2Ana for one source file.
         *
         * The text name is the file name without its last extension (a name whose only
         * dot is the first character is kept whole). The watched header tags are all
         * registered as "not seen yet", and hasText is initialised from
         * HasElement(file, "text").process() — presumably whether the file already
         * contains a text element.
         *
         * @param file the source XML file to transform
         */
        public Xml2Ana(File file) {
                super(file.toURI().toURL());

                String name = file.getName();
                int dot = name.lastIndexOf(".");
                textname = (dot > 0) ? name.substring(0, dot) : name;

                checkTags = new HashMap<String,Boolean>();
                for (String watched : ["respStmt", "titleStmt", "appInfo"]) {
                        checkTags.put(watched, false);
                }

                hasText = new HasElement(file, TEXT).process();
        }
117

    
118
        /**
         * Chooses what happens to word attributes that have no correspondence and are not the id:
         * converted to annotations when true, copied unchanged as attributes when false.
         * (The misspelling in the method name is part of the public interface.)
         *
         * @param value true to convert every remaining attribute into an annotation
         * @return the value that was set
         */
        public setConvertAllAtrtibutes(boolean value) {
                this.convertAllAttributes = value;
        }
127
        
128
        /**
         * Sets the local name of the elements that are processed as words.
         *
         * @param wtag the word element local name
         * @return the name that was set
         */
        public setWordTag(String wtag) {
                this.wtag = wtag;
        }
137

    
138
        /** Number of word elements started so far. */
        int idcount = 0;
        /** True between the start and the end of a word element; character data is then trimmed. */
        boolean flagWord = false;
        /** Number of start elements read so far (1 while processing the root element). */
        int firstElement = 0;
        /** True when a TEI root element was generated because the source root is not TEI. */
        boolean teiElementAdded = false;
        /** True when a teiHeader was generated because the second element is not teiHeader. */
        boolean teiHeaderElementAdded = false;
        /** True once a text element is known to exist in the source or has been generated. */
        boolean hasText = false;
        /** True when a text element was generated and must be closed in after(). */
        boolean textElementAdded = false;
        /** Annotations of the current word, each stored as [resp, type, value]. */
        def anabalises = [];
146
        /**
         * Handles one start-element event.
         *
         * - Word element (local name equal to wtag): the element is rewritten. Attributes listed
         *   in correspType are queued as annotations, the id attribute is normalised and kept,
         *   the other attributes are queued or copied depending on convertAllAttributes, and a
         *   txm:form child is opened to receive the word text.
         * - TEI: copied by the superclass, then the TEI default namespace and the TXM namespace
         *   are declared if the source element does not declare them.
         * - Any other element: a missing TEI root, teiHeader or text element is generated
         *   where needed before the element is copied by the superclass.
         */
        protected void processStartElement()
        {
                firstElement++;
                String name = parser.getLocalName();
                if (checkTags.containsKey(name)) {
                        checkTags.put(name, true);
                }

                switch (name) {
                        case wtag:
                                if (!hasText) {
                                        openMissingTextElement();
                                }
                                idcount++; // increment word counter
                                anabalises.clear();

                                writer.writeStartElement(name); // write w

                                for (int i = 0 ; i < parser.getNamespaceCount() ; i++) { // write namespaces
                                        writer.writeNamespace(parser.getNamespacePrefix(i), parser.getNamespaceURI(i));
                                }

                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) { // transform attributes
                                        String type = parser.getAttributeLocalName(i);
                                        String value = parser.getAttributeValue(i);
                                        if (correspType.containsKey(type)) { // becomes a txm:ana
                                                String corresptype = correspType.get(type);
                                                String ref = correspRef.get(type);
                                                anabalises.add(["#"+ref, "#"+corresptype, value]);
                                        } else if (type == ID) { // keep the id attribute
                                                String wordid = value;
                                                // "wXYZ" becomes "w_XYZ"; ids already starting with "w_" or not starting with "w" are kept
                                                if (wordid.startsWith("w") && !wordid.startsWith("w_")) {
                                                        wordid = "w_"+wordid.substring(1);
                                                }
                                                wordid = AsciiUtils.buildAttributeId(wordid); // remove characters not compatible with the id attribute value
                                                writer.writeAttribute(type, wordid);
                                        } else if (convertAllAttributes) { // attribute without correspondence: annotation without resp
                                                anabalises.add(["none","#"+type, value]);
                                        } else { // attribute without correspondence: copied as is
                                                writer.writeAttribute(type, value);
                                        }
                                }

                                flagWord = true; // start to capture the form
                                writer.writeStartElement(TXMNS, "form");
                                break;

                        case "TEI":
                                super.processStartElement();
                                boolean hasTeiNS = false;
                                boolean hasTXMNs = false;
                                for (int i = 0 ; i < parser.getNamespaceCount() ; i++) {
                                        if (parser.getNamespaceURI(i) == TXMNS)
                                                hasTXMNs = true;
                                        else if (parser.getNamespaceURI(i) == TEINS)
                                                hasTeiNS = true;
                                }
                                if (!hasTeiNS) {
                                        writer.writeDefaultNamespace(TEINS);
                                }
                                if (!hasTXMNs) {
                                        writer.writeNamespace(TXM, TXMNS);
                                }
                                break;

                        default:
                                if (TEXT.equals(localname)) {
                                        hasText = true;
                                }

                                if (firstElement == 1) { // test if first element is TEI
                                        if (localname != "TEI") { // "TEI" is missing
                                                teiElementAdded = true;
                                                addTEIElement();
                                        } else if (!hasText) {
                                                openMissingTextElement();
                                        }
                                }
                                if (firstElement == 2 && teiElementAdded != true) {
                                        if (localname != "teiHeader") { // teiHeader is missing
                                                writeTeiHeader();
                                                hasTeiHeader = true
                                                teiHeaderElementAdded = true
                                        }
                                } else if (!hasText && (teiElementAdded || teiHeaderElementAdded)) {
                                        openMissingTextElement();
                                }

                                super.processStartElement();
                                if (TEXT.equals(localname)) {
                                        // a text element without id gets the text name as id
                                        if (!parser.getAttributeValue(null, ID)) {
                                                writer.writeAttribute(ID, textname);
                                        }
                                }
                }
        }

        /** Opens a generated text element identified by the text name and records that it must be closed in after(). */
        private void openMissingTextElement()
        {
                writer.writeStartElement(TEXT);
                writer.writeAttribute(ID, textname);
                textElementAdded = true;
                hasText = true;
        }
261

    
262
        /**
         * Closes the elements generated by this class, innermost first (text, then TEI),
         * before delegating to the superclass.
         */
        protected void after()
        {
                if (textElementAdded) writer.writeEndElement(); // text
                if (teiElementAdded) writer.writeEndElement(); // TEI

                super.after(); // close writer, parser, etc
        }
272

    
273
        /**
         * Opens a generated TEI root element, declares the TEI namespace (as default and
         * with the TEI prefix) and the TXM namespace, then writes a generated teiHeader.
         * The TEI element is left open.
         */
        protected void addTEIElement()
        {
                writer.writeStartElement("TEI");

                // namespace declarations of the generated root
                writer.writeDefaultNamespace(TEINS);
                writer.writeNamespace(TXM, TXMNS);
                writer.writeNamespace(TEI, TEINS);

                writeTeiHeader();
        }
281

    
282
        /**
         * Writes character data: trimmed while inside a word element (so the form stays
         * on one line), delegated to the superclass everywhere else.
         */
        protected void processCharacters()
        {
                if (!flagWord) {
                        super.processCharacters();
                        return;
                }
                String form = parser.getText().trim(); // keep form in 1 line
                writer.writeCharacters(form);
        }
290
        
291
        /** True once the end of a classDecl element has been read. */
        boolean hasClassDecl = false;
        /** True once the end of a fileDesc element has been read. */
        boolean hasFileDesc = false;
        /** True once the end of an encodingDesc element has been read. */
        boolean hasEncodingDesc = false;
        /** True once a teiHeader has been read or generated. */
        boolean hasTeiHeader = false;
        /** True once the end of the TEI element has been read. */
        boolean hasTEI = false;
        /** Local name of the annotation element written in the TXM namespace. */
        public static String ANA = "ana"
        /** Name used both for the resp element and for the resp attribute. */
        public static String RESP = "resp"
        /** Name of the type attribute. */
        public static String TYPE = "type"
299
        /**
         * Handles one end-element event, injecting the TXM header content just before
         * the matching closing tag is written by the superclass:
         * - word element: closes txm:form and writes one txm:ana per queued annotation;
         * - fileDesc: writes the respStmt elements;
         * - classDecl: writes the taxonomies;
         * - encodingDesc: writes the appInfo (and a classDecl if none was read);
         * - teiHeader: writes an encodingDesc if none was read;
         * - TEI: writes a whole teiHeader if none was read or generated.
         */
        protected void processEndElement()
        {
                String closing = parser.getLocalName();
                switch (closing) {
                        case wtag:
                                writer.writeEndElement(); // txm:form
                                // <txm:ana resp=ref type=corresptype>value</txm:ana>
                                for (def ana : anabalises) {
                                        writer.writeStartElement(TXMNS, ANA);
                                        writer.writeAttribute(RESP, ana[0]);
                                        writer.writeAttribute(TYPE, ana[1]);
                                        writer.writeCharacters(ana[2]);
                                        writer.writeEndElement(); // txm:ana
                                }
                                flagWord = false;
                                break;

                        case "fileDesc":
                                hasFileDesc = true;
                                writeTXMResps();
                                break;

                        case "classDecl":
                                hasClassDecl = true;
                                writeTXMTaxonomies();
                                break;

                        case "encodingDesc":
                                hasEncodingDesc = true;
                                writeContentOfEncodingDesc();
                                break;

                        case "teiHeader":
                                hasTeiHeader = true;
                                if (!hasEncodingDesc) {
                                        writer.writeStartElement("encodingDesc");
                                        writeContentOfEncodingDesc();
                                        writer.writeEndElement();
                                }
                                break;

                        case "TEI":
                                hasTEI = true;
                                if (!hasTeiHeader) {
                                        writeTeiHeader();
                                }
                                break;
                }

                super.processEndElement();
        }
349

    
350
        /**
         * Writes a complete generated teiHeader: a fileDesc holding the respStmt elements,
         * an empty titleStmt/title, an empty publicationStmt and an empty sourceDesc,
         * followed by an encodingDesc filled by writeContentOfEncodingDesc().
         */
        protected void writeTeiHeader()
        {
                writer.writeStartElement("teiHeader");

                writer.writeStartElement("fileDesc");
                writeTXMResps();

                writer.writeStartElement("titleStmt");
                writer.writeStartElement("title");
                writer.writeEndElement(); // title
                writer.writeEndElement(); // titleStmt

                // publicationStmt and sourceDesc are written empty
                for (String emptyChild : ["publicationStmt", "sourceDesc"]) {
                        writer.writeStartElement(emptyChild);
                        writer.writeEndElement();
                }
                writer.writeEndElement(); // fileDesc

                writer.writeStartElement("encodingDesc");
                writeContentOfEncodingDesc();
                writer.writeEndElement(); // encodingDesc

                writer.writeEndElement(); // teiHeader
        }
369

    
370
        /**
         * Writes the content of an encodingDesc: an appInfo with the application
         * descriptions and, when no classDecl was read, a classDecl with the taxonomies.
         */
        protected void writeContentOfEncodingDesc()
        {
                writer.writeStartElement("appInfo");
                writeTXMApps();
                writer.writeEndElement(); // appInfo

                if (hasClassDecl) {
                        return; // the taxonomies were already written when the classDecl was closed
                }

                writer.writeStartElement("classDecl");
                writeTXMTaxonomies();
                writer.writeEndElement(); // classDecl
        }
381

    
382
        /**
         * Builds a report of the watched header tags.
         *
         * NOTE(review): the loop iterates the map itself, not its keys, so every watched
         * tag is listed whether or not it was found, and each line is presumably rendered
         * as "name=true|false" — confirm whether only the found tags were intended.
         *
         * @return the report: a "found tags : " line followed by one tab-indented line per watched tag
         */
        public String checkResp()
        {
                StringBuilder report = new StringBuilder("found tags : \n");
                for (String watched : checkTags) {
                        report.append("\t").append(watched).append("\n");
                }
                return report.toString();
        }
394

    
395
        /**
         * Sets the correspondences used to convert word attributes into annotations.
         *
         * @param correspRef word attribute name -> responsibility id of the annotation
         * @param correspType word attribute name -> type of the annotation
         */
        public void setCorrespondances(correspRef, correspType)
        {
                this.correspType = correspType;
                this.correspRef = correspRef;
        }
406

    
407
        /**
         * Sets the data written in the TEI header.
         *
         * @param respId the responsibility ids
         * @param resps responsibility id -> [resp text, person name, date "when" value, date text]
         * @param applications responsibility id -> [application ident, version, report file or null]
         * @param taxonomies responsibility id -> taxonomy targets used by the application
         * @param items taxonomy id -> (ref type -> ref target)
         */
        public void setHeaderInfos(respId,resps, applications, taxonomies, items)
        {
                // responsibilities
                this.respId = respId;
                this.resps = resps;

                // applications and taxonomies
                this.applications = applications;
                this.taxonomies = taxonomies;
                this.items = items;
        }
424

    
425
        /**
         * Writes one respStmt per responsibility id:
         * a resp element (id, description text and a nested date element) followed by
         * a name element of type "person".
         */
        public void writeTXMResps()
        {
                for (String ref : respId) {
                        String[] details = resps.get(ref);

                        writer.writeStartElement("respStmt");

                        // <resp id=ref>description<date when=...>date text</date></resp>
                        writer.writeStartElement(RESP);
                        writer.writeAttribute(ID, ref);
                        writer.writeCharacters(details[0]);
                        writer.writeStartElement("date");
                        writer.writeAttribute("when", details[2]);
                        writer.writeCharacters(details[3]);
                        writer.writeEndElement(); // date
                        writer.writeEndElement(); // resp

                        // <name type="person">person</name>
                        writer.writeStartElement("name");
                        writer.writeAttribute(TYPE, "person");
                        writer.writeCharacters(details[1]);
                        writer.writeEndElement(); // name

                        writer.writeEndElement(); // respStmt
                }
        }
448

    
449
        /**
         * Writes one txm:application element per responsibility id.
         *
         * Each application description is read as [ident, version, report file]. When the
         * report file is not null, its non-empty lines are copied verbatim to the raw output
         * inside the application element. The element ends with an "ab" of type "annotation"
         * listing one tagset ref per taxonomy target of the responsibility.
         */
        public void writeTXMApps()
        {
                for (String ref : respId) {
                        List description = applications.get(ref);
                        String ident = description.get(0);
                        String version = description.get(1);
                        File report = description.get(2);

                        writer.writeStartElement(TXMNS, "application");
                        writer.writeAttribute("ident", ident);
                        writer.writeAttribute("version", version);
                        writer.writeAttribute(RESP, ref);

                        // get txm:commandLine from GeneratedReport
                        if (report != null) {
                                // force the pending start tag to be completed and flushed before writing to the raw output
                                writer.writeCharacters("");writer.flush();
                                // NOTE(review): the report is decoded with the platform default charset — confirm
                                BufferedReader reader = new BufferedReader(new FileReader(report));
                                try {
                                        String line;
                                        while ((line = reader.readLine()) != null) {
                                                if (line.length() != 0)
                                                        output.write(line+"\n");
                                        }
                                } finally {
                                        reader.close(); // released even when reading or writing fails
                                }
                        }

                        writer.writeStartElement("ab");
                        writer.writeAttribute(TYPE, "annotation");
                        for (String item : taxonomies.get(ref)) {
                                writer.writeStartElement("list");
                                writer.writeEmptyElement("ref");
                                writer.writeAttribute(TYPE, "tagset");
                                writer.writeAttribute("target", item);
                                writer.writeEndElement(); // list
                        }
                        writer.writeEndElement(); // ab
                        writer.writeEndElement(); // txm:application
                }
        }
491

    
492
        /**
         * Writes one taxonomy element per entry of items: the taxonomy id, then a bibl of
         * type "tagset" holding the taxonomy name as title and one empty ref (type and
         * target) per declared link.
         */
        public void writeTXMTaxonomies()
        {
                for (Map.Entry<String,HashMap<String,String>> taxonomy : items.entrySet()) {
                        String tax = taxonomy.getKey();

                        writer.writeStartElement("taxonomy");
                        writer.writeAttribute(ID, tax);

                        writer.writeStartElement("bibl");
                        writer.writeAttribute(TYPE, "tagset");

                        writer.writeStartElement("title");
                        writer.writeCharacters(tax);
                        writer.writeEndElement(); // title

                        for (Map.Entry<String,String> link : taxonomy.getValue().entrySet()) {
                                String refType = link.getKey();
                                writer.writeEmptyElement("ref");
                                writer.writeAttribute(TYPE, refType);
                                writer.writeAttribute("target", link.getValue());
                        }

                        writer.writeEndElement(); // bibl
                        writer.writeEndElement(); // taxonomy
                }
        }
516

    
517
        /**
         * Sample run of the converter on a file of the developer's home directory.
         *
         * @param args the arguments (not used)
         */
        public static void main(String[] args) {

                // "~" is only expanded by shells, never by java.io.File: resolve the home directory explicitly
                String rootDir = System.getProperty("user.home") + "/xml/rgaqcj/";
                new File(rootDir+"anainline/").mkdirs();

                String file = "roland-p5.xml";
                String anafile = "roland-p5.xml";

                def correspType = new HashMap<String,String>()
                // correspType(word attribute, type attribute of the ana property of the txm w)
                correspType.put("p2","CATTEX2009");

                def correspRef = new HashMap<String,String>()
                // correspRef(word attribute, ref attribute of the ana property of the txm w; the ref points to the id of a respStmt of the TEI header)
                correspRef.put("p2","ctx1");

                // the ids of all the respStmt must be listed
                def respId = ["ctx1"];//,"TT1", "TnT1"];

                // maps each respId to the description of the tool: ident, version, execution report
                def applications = new HashMap<String,List>();
                applications.put("ctx1",new ArrayList<String>());
                applications.get("ctx1").add("Oxygen");//app ident
                applications.get("ctx1").add("9.3");//app version
                applications.get("ctx1").add(null);//app report file path

                // maps each respId to the type attributes of the ana properties of the txm w,
                // used to build the refs to the taxonomies
                def taxonomiesUtilisees = new HashMap<String,String[]>();
                taxonomiesUtilisees.put("ctx1",["CATTEX2009"]);//,"lemma","lasla","grace"]);

                // associates an item id with its description and its URI
                def itemsURI = new HashMap<String,HashMap<String,String>>();
                itemsURI.put("CATTEX2009",new HashMap<String,String>());
                itemsURI.get("CATTEX2009").put("tagset","http://bfm.ens-lsh.fr/IMG/xml/cattex2009.xml");
                itemsURI.get("CATTEX2009").put("website","http://bfm.ens-lsh.fr/article.php3?id_article=176");

                // respStmt informations
                // resps(respId <see above>, [description, person, date, date text])
                def resps = new HashMap<String,String[]>();
                resps.put("ctx1", ["initial tagging","alavrentiev","2010-03-02","Tue Mar  2 21:02:55 Paris, Madrid 2010"])

                // run the conversion
                def builder = new Xml2Ana(new File(rootDir+"/src/",file));
                builder.setCorrespondances(correspRef, correspType);
                builder.setHeaderInfos(respId,resps, applications, taxonomiesUtilisees, itemsURI)
                // NOTE(review): the output is given as a bare file name, so the "anainline"
                // directory created above is not used — confirm the argument expected by process()
                builder.process(anafile);

                return
        }
575

    
576
}