Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / transcriber / TRSToTEI.groovy @ 1000

History | View | Annotate | Download (8.7 kB)

1
package org.txm.scripts.importer.transcriber
2

    
3
import java.io.File;
4

    
5
import javax.xml.stream.*;
6

    
7
import java.io.OutputStreamWriter;
8
import java.util.HashMap;
9
import java.util.LinkedHashMap;
10

    
11
import org.txm.importer.StaxIdentityParser;
12
import org.txm.importer.cwb.CwbEncode
13
import org.txm.importer.cwb.CwbMakeAll
14
import org.txm.importer.cwb.CwbProcess;
15
import org.txm.importer.cwb.PatchCwbRegistry;
16
import org.txm.utils.Pair;
17

    
18
class TRSToTEI extends StaxIdentityParser {
19
        
20
        // ---- Data collected from the transcription header ----

        /** Attributes of the root Trans element; copied onto the text element when Episode starts. */
        def informations = new LinkedHashMap()
        /** Speaker id -> list of (attribute name, attribute value) pairs of the Speaker declaration. */
        def speakers = new LinkedHashMap()
        /** Speaker id -> speaker name. */
        HashMap<String, String> speakersname = new HashMap<String, String>()
        /** Attribute name -> value of the Topic elements (a later Topic overwrites identical names). */
        def topics = new LinkedHashMap()

        /** Speaker ids declared by the current Turn. */
        List<String> localspeakers

        // ---- Identifier counters ----

        /** Next id given to an sp element. */
        int idturn = 1
        /** Next id given to a div element. */
        int idsection = 1
        /** Not referenced by the code of this class. */
        int idu = 1
        /** Next id given to an event element. */
        int idevent = 1

        /** True while a u element is open in the output. */
        boolean uOpened = false

        // ---- Stored values ----
        // NOTE(review): apart from vSpeaker and u_name, the fields of this group are not
        // referenced by the methods of this class - confirm before removing them.

        List<String> events = new ArrayList<String>()
        static int vEntityId = 0
        static int vEntityIdCount = 1
        /** Speaker of the utterance being written: "N/A" or the value taken from Turn or Who. */
        String vSpeaker = ""
        /** Value of the name attribute of the last speaker written by writeU(). */
        String u_name
        boolean flagAna
        boolean flagForm
        boolean flagWord
        String vWord = ""
        String vForm = ""
        String vAna = ""
        String vEvents = "N/A"
        String vEntityType = "N/A"
        String wordid = ""
        String anatype = ""
        String anavalue = ""

        /** Formatted start time of the current Turn. */
        String formatedTime
        String currentType
        /** Raw time attribute of the last Sync element read. */
        String lastTime = ""
        /** Identifier of the text, derived from the input file name. */
        String textid
58
        /**
         * Creates a converter reading the given Transcriber file.
         * The text id is the file name cut at the first occurrence of ".trs",
         * or the whole file name when ".trs" is absent or starts the name.
         *
         * @param infile the transcription file to convert
         */
        public TRSToTEI(File infile) {
                super(infile.toURI().toURL());
                String filename = infile.getName()
                int extensionStart = filename.indexOf(".trs")
                textid = (extensionStart > 0) ? filename.substring(0, extensionStart) : filename
        }
64

    
65
        /** True until parseInfos() has handled the Episode start tag. */
        boolean beforeBody = true;

        /**
         * Dispatches the current start tag: header parsing while the Episode tag
         * has not been reached, episode conversion afterwards.
         */
        protected void processStartElement() {
                if (!beforeBody) {
                        processEpisodeStartElement()
                        return
                }
                parseInfos()
        }
73

    
74
        /**
         * Dispatches the current end tag: header handling while the Episode tag
         * has not been reached, episode handling afterwards.
         */
        protected void processEndElement() {
                if (!beforeBody) {
                        processEpisodeEndElement()
                        return
                }
                parseInfosEndElements()
        }
81

    
82
        /**
         * Handles a start tag read before the transcription body (until the Episode tag).
         * <ul>
         * <li>Trans: stores its attributes and writes the TEI root and header skeleton,
         * leaving an "application" element open.</li>
         * <li>Topics, Topic, Speakers, Speaker: copied by the parent parser; the attributes
         * of Topic and Speaker are also recorded.</li>
         * <li>Episode: closes the header, opens the "text" element and switches to body
         * processing.</li>
         * </ul>
         * Any other tag is ignored.
         */
        private void parseInfos() { // until tag Episode
                switch (localname) {
                        case "Trans":
                                // kept to be written as attributes of <text> when Episode starts
                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                        informations.put(parser.getAttributeLocalName(i), parser.getAttributeValue(i));
                                }
                                writer.writeStartElement("TEI");
                                writer.writeStartElement("teiHeader");
                                writer.writeStartElement("fileDesc");
                                writer.writeEndElement();
                                writer.writeStartElement("encodingDesc");
                                writer.writeEndElement();
                                writer.writeStartElement("profileDesc");
                                writer.writeEndElement();
                                writer.writeStartElement("application"); // closed by the "Episode" case
                                break;
                        case "Topics":
                        case "Speakers":
                                super.processStartElement();
                                break;
                        case "Topic":
                                super.processStartElement();
                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                        topics.put(parser.getAttributeLocalName(i), parser.getAttributeValue(i));
                                }
                                break;
                        case "Speaker":
                                super.processStartElement();
                                String id = parser.getAttributeValue(null, "id");
                                String name = parser.getAttributeValue(null, "name");
                                if (id == null || name == null) {
                                        // a speaker is only registered when both attributes are present
                                        println "found tag $localname with no id ($id) nor name ($name)"
                                        break;
                                }
                                speakersname.put(id, name);

                                // keep every attribute of the declaration; writeU() copies them onto <u>
                                List<Pair<String, String>> attributes = new ArrayList<Pair<String, String>>()
                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                        attributes.add(new Pair(parser.getAttributeLocalName(i), parser.getAttributeValue(i)));
                                }
                                speakers.put(id, attributes);
                                break;
                        case "Episode":
                                writer.writeEndElement() // application
                                writer.writeEndElement() // teiHeader
                                writer.writeStartElement("text")
                                writer.writeAttribute("id", textid)
                                for (def k : informations.keySet()) {
                                        writer.writeAttribute(k, informations[k])
                                }
                                beforeBody = false; // end of info parsing
                                break;
                }
        }
147

    
148
        /**
         * Handles an end tag read before the Episode tag: the end tags of Topics, Topic,
         * Speakers and Speaker are delegated to the parent parser, any other one is ignored.
         */
        private void parseInfosEndElements() { // until tag Episode
                if (localname in ["Topics", "Topic", "Speakers", "Speaker"]) {
                        super.processEndElement();
                }
        }
165

    
166
        
167
        /** True when the next Sync must not open a 'u': set by a Turn declaring several speakers. */
        boolean ignoreFirstSync = false

        /**
         * Converts a start tag of the transcription body.
         * <ul>
         * <li>Section: opens a "div" with a generated id.</li>
         * <li>Turn: opens a "sp" with a generated id, an "overlap" flag and a formatted time.</li>
         * <li>Sync: stores its time and opens a "u", except for the first Sync of a
         * Turn with several speakers.</li>
         * <li>Who: selects the nth speaker of the current Turn and opens a "u".</li>
         * <li>Event: opens an "event" with a generated id.</li>
         * <li>any other tag: delegated to the parent parser.</li>
         * </ul>
         * An opened "u" is closed before a Section, Turn, Sync or Who is handled.
         *
         * @return always true; the callers in this class ignore the value
         */
        private boolean processEpisodeStartElement() {
                switch (localname) {
                        case "Section": // >> div
                                testCloseU();
                                writer.writeStartElement("div")
                                writer.writeAttribute("id", Integer.toString(idsection++))
                                writeAttributes();
                                break;
                        case "Turn": // >> sp
                                testCloseU();
                                vSpeaker = parser.getAttributeValue(null, "speaker");
                                if (vSpeaker == null) { // no spk
                                        vSpeaker = "N/A"
                                        // forget the speakers of the previous Turn so that a <Who>
                                        // of this Turn cannot pick a stale (or null) list
                                        localspeakers = []
                                } else {
                                        localspeakers = vSpeaker.split(" ")
                                        // NOTE(review): this size is 0 only when the attribute holds nothing
                                        // but spaces, so the name substitution below is effectively disabled.
                                        // Testing for 1 would put the speaker name in vSpeaker, whereas
                                        // writeU() looks speakers up by id - confirm the intent before changing.
                                        if (localspeakers.size() == 0) { // only one speaker
                                                if (speakersname.containsKey(vSpeaker)) {
                                                        vSpeaker = speakersname.get(vSpeaker);
                                                }
                                        }
                                }

                                writer.writeStartElement("sp")
                                writer.writeAttribute("id", Integer.toString(idturn++))
                                ignoreFirstSync = vSpeaker.contains(" ") // need to skip first Sync when multiple locutors in Turn
                                writer.writeAttribute("overlap", ""+ignoreFirstSync)

                                String time = parser.getAttributeValue(null, "startTime");
                                formatedTime = formatTime(time)
                                writer.writeAttribute("time", formatedTime)
                                writeAttributes();
                                break;
                        case "Sync": // >> u
                                lastTime = parser.getAttributeValue(null, "time")
                                testCloseU();
                                if (ignoreFirstSync) { // need to skip first Sync when multiple locutors in Turn
                                        ignoreFirstSync = false;
                                } else {
                                        writeU()
                                }
                                break;
                        case "Who": // >> u
                                testCloseU();
                                int n = -1; // stays out of range when @nb is missing or not a number
                                try {
                                        n = Integer.parseInt(parser.getAttributeValue(null, "nb")) - 1;
                                } catch (NumberFormatException e) {
                                        // reported by the mismatch warning below
                                }
                                if (localspeakers == null || localspeakers.size() <= n || n < 0) {
                                        println "\nWarning: Mismatch speaker number declaration between <Who> and <Turn> tags at line "+parser.getLocation().getLineNumber()+" of the '"+inputurl.getFile()+"' transcription file.";
                                        vSpeaker = "#"+(n+1)+"?";
                                } else {
                                        vSpeaker = localspeakers.get(n);
                                }
                                writeU()
                                break;
                        case "Event": // >> event
                                writer.writeStartElement("event")
                                writer.writeAttribute("id", Integer.toString(idevent++))
                                writeAttributes();
                                break;
                        default:
                                super.processStartElement();
                }
                return true;
        }
236

    
237
        /**
         * Converts an end tag of the transcription body.
         * Section and Turn first close a pending "u", then are closed through the parent parser.
         * Sync, Who and Episode write nothing. Trans closes the "text" and "TEI" elements.
         * Any other end tag is delegated to the parent parser.
         */
        void processEpisodeEndElement() {
                switch (localname) {
                        case "Section": // div
                        case "Turn": // sp
                                testCloseU(); // u
                                super.processEndElement();
                                break
                        case "Trans":
                                writer.writeEndElement(); // text
                                writer.writeEndElement(); // TEI
                                break
                        case "Sync":
                        case "Who":
                        case "Episode":
                                // nothing is written for these end tags
                                break
                        default:
                                super.processEndElement();
                }
        }
265

    
266
        /**
         * Closes the current 'u' element through the parent parser when one is opened;
         * does nothing otherwise.
         */
        private testCloseU() {
                if (!uOpened) {
                        return
                }
                super.processEndElement(); // u
                uOpened = false;
        }
275

    
276
        /**
         * Opens a 'u' element for the current speaker.
         * The element gets the raw time of the last Sync ("s") and its formatted form ("time").
         * When the speaker was declared, all the attributes of its declaration are copied,
         * "name" being renamed to "spk" and "id" to "spkid". When it was not declared, "spk" and
         * "spkid" are only written for the "#n?" placeholder built from a mismatching Who tag.
         */
        private void writeU() {
                writer.writeStartElement("u")
                writer.writeAttribute("s", lastTime);
                writer.writeAttribute("time", formatTime(lastTime));

                def declaration = speakers.get(vSpeaker)
                if (declaration != null) {
                        def renamed = [name: "spk", id: "spkid"]
                        for (Pair attribute : declaration) {
                                String source = attribute.getFirst().toString();
                                if (source == "name") {
                                        u_name = attribute.getSecond();
                                }
                                String target = renamed.containsKey(source) ? renamed.get(source) : source
                                writer.writeAttribute(target, attribute.getSecond())
                        }
                } else if (vSpeaker.startsWith("#") && vSpeaker.endsWith("?")) {
                        // in case of Who@n wrong number; the "N/A" speaker is not shown
                        writer.writeAttribute("spk", vSpeaker)
                        writer.writeAttribute("spkid", vSpeaker)
                }
                uOpened = true;
        }
308
        /**
         * Formats a time given as text.
         *
         * @param time a number of seconds written as a decimal number
         * @return the formatted time; the formatting of 0 when the value cannot be processed
         */
        private String formatTime(String time) {
                try {
                        float seconds = Float.parseFloat(time)
                        return formatTime(seconds)
                } catch (Exception e) {
                        // null or not a number: fall back to time zero
                        return formatTime(0.0f)
                }
        }
313

    
314
        /**
         * Formats a number of seconds as hours:minutes:seconds, minutes and seconds being
         * padded with a leading zero when below 10. Fractions of a second are dropped.
         *
         * @param time a duration in seconds
         * @return the formatted time, for instance "1:02:03"
         */
        private String formatTime(float time) {
                float hours = time / 3600;
                time = time % 3600;
                float minutes = (time % 3600) / 60;
                int seconds = (int)time % 60

                StringBuilder formatted = new StringBuilder()
                formatted.append("" + (int)hours)
                formatted.append(minutes < 10 ? ":0" : ":").append("" + (int)minutes)
                formatted.append(seconds >= 10 ? ":" : ":0").append("" + seconds)
                return formatted.toString();
        }
332

    
333
        /**
         * Manual test entry point: converts one Transcriber file to TEI.
         *
         * @param args optional: args[0] is the input .trs file and args[1] the output XML file;
         *             the historical development paths are used for the missing ones
         */
        public static void main(String[] args) {
                String inpath = (args != null && args.length > 0) ? args[0] : "/home/mdecorde/xml/trs/int01.trs"
                String outpath = (args != null && args.length > 1) ? args[1] : "/home/mdecorde/xml/trs/test-tei.xml"
                File infile = new File(inpath)
                File outfile = new File(outpath)

                TRSToTEI p = new TRSToTEI(infile);
                p.process(outfile)
        }
340

    
341
}