Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / projects / nov13 / CreateTheOtherTurns.groovy @ 3038

History | View | Annotate | Download (7.9 kB)

1
package org.txm.macro.projects.nov13
2

    
3
import javax.xml.stream.*
4

    
5
import org.txm.importer.PersonalNamespaceContext
6
import org.txm.utils.FileUtils
7
import org.txm.xml.IdentityHook
8
import org.txm.xml.*
9

    
10
import java.io.BufferedOutputStream
11
import java.io.FileOutputStream
12
import java.io.IOException
13
import java.net.URL
14
import java.util.*
15
import java.util.Map.Entry
16
import java.util.regex.Pattern
17

    
18
/**
 * XML processor that rewrites a transcription so that the passages delimited by
 * <code>*</code> markers inside <code>w</code> elements get their own <code>Turn</code>.
 *
 * When a word contains an opening <code>*</code>, the current <code>Turn</code> is closed and a
 * new <code>Turn</code> is opened with a replaced <code>speaker</code> attribute (the original
 * value is kept in <code>orig-speaker</code>). When a word contains a closing <code>*</code>,
 * that new <code>Turn</code> is closed and the original <code>Turn</code> is re-opened with its
 * <code>startTime</code> set from the word's <code>end</code> attribute.
 */
class CreateTheOtherTurns extends XMLProcessor {

    /** Activator restricted to the "Speaker", "w", "Turn" and "Sync" local names. */
    LocalNamesHookActivator activator
    /** Hook performing the rewriting; built in the constructor. */
    IdentityHook hook

    /** Regex identifying the primary speaker id, or null when none is usable. */
    def primarySpeakerIdRegex
    /** Primary speaker id, taken from the file name and then from matching Speaker@id values. */
    String primarySpeakerId

    /** When true, progress messages are printed. */
    Boolean debug

    /** Speaker id given to the created Turn when the enclosing Turn belongs to the primary speaker. */
    String otherNonPrimarySpeakerId = "other"

    /**
     * @param xmlfile the XML file to process
     * @param primarySpeakerIdRegexString regex of the primary speaker id; ignored when null or empty,
     *        and discarded when it does not match exactly once in the file name (extension stripped)
     * @param otherNonPrimarySpeakerId speaker id to use for the created Turn when the current
     *        speaker matches the primary speaker regex (or when there is no regex)
     * @param debug when true, prints progress messages
     */
    public CreateTheOtherTurns(File xmlfile, String primarySpeakerIdRegexString, String otherNonPrimarySpeakerId, Boolean debug) {
        super(xmlfile)
        this.debug = debug

        this.otherNonPrimarySpeakerId = otherNonPrimarySpeakerId
        if (primarySpeakerIdRegexString != null && primarySpeakerIdRegexString.length() > 0) {
            String id = FileUtils.stripExtension(xmlfile)

            this.primarySpeakerIdRegex = /$primarySpeakerIdRegexString/

            // the regex must be found exactly once in the file name to be trusted
            def matches = (id =~ primarySpeakerIdRegex).findAll()
            if (matches.size() != 1) {
                if (debug) println "WARNING: found the ${matches.size()} matches of primary speaker prefix in the '$id' file name"
                this.primarySpeakerIdRegex = null
            } else {
                primarySpeakerId = matches[0]
                //if (debug) println "Detected primary speaker: $primarySpeakerId"
            }
        }

        // NOTE(review): 'hook' is still null at this point; presumably the IdentityHook
        // constructor below registers itself on the activator -- TODO confirm
        activator = new LocalNamesHookActivator<>(hook, ["Speaker", "w", "Turn", "Sync"])

        hook = new IdentityHook("word_hook", activator, this) {

            // set when a Turn starts; not read anywhere in this hook
            boolean inTurn = false

            // true between the start and the end of a w element: its text is buffered
            boolean inW = false
            StringBuilder wordBuffer = new StringBuilder()

            // last time value seen on a Turn@startTime, Sync@time or w@time
            String currentTime
            // attributes of the last Turn start element, in document order
            LinkedHashMap turnInfos = new LinkedHashMap()
            // attributes of the last w start element, in document order
            LinkedHashMap wInfos = new LinkedHashMap()
            // true while a created 'other' Turn is open
            boolean other

            @Override
            public boolean deactivate() {
                return true
            }

            @Override
            public boolean _activate() {
                return true
            }

            /**
             * Records the Speaker/Turn/Sync/w information needed later. Every element
             * but <code>w</code> is passed to the default processing; <code>w</code> is
             * held back and written when its end tag is reached.
             */
            @Override
            protected void processStartElement() throws XMLStreamException, IOException {
                if (localname.equals("Speaker")) { // find out the main speaker
                    String id = parser.getAttributeValue(null, "id")
                    if (id ==~ primarySpeakerIdRegex) {
                        primarySpeakerId = id
                    }
                    super.processStartElement()
                } else if (localname.equals("Turn")) {
                    // store the Turn attributes so that the Turn can be re-opened later
                    inTurn = true
                    turnInfos.clear()
                    for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                        turnInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
                    }
                    currentTime = turnInfos["startTime"]
                    super.processStartElement()
                } else if (localname.equals("Sync")) {
                    currentTime = parser.getAttributeValue(null, "time")
                    super.processStartElement()
                } else if (localname.equals("w")) {
                    // store the word attributes; the element itself is written by processEndElement()
                    inW = true
                    wInfos.clear()
                    for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                        wInfos[parser.getAttributeLocalName(i)] = parser.getAttributeValue(i)
                    }
                    String time = parser.getAttributeValue(null, "time")
                    if (time != null && time.length() > 0) {
                        currentTime = time
                    }
                    wordBuffer.setLength(0)
                    return // write w later
                } else {
                    super.processStartElement()
                }
            }

            /** Buffers the text found inside a w element; other text gets the default processing. */
            @Override
            protected void processCharacters() throws XMLStreamException {
                if (inW) {
                    wordBuffer.append(parser.getText())
                } else {
                    super.processCharacters()
                }
            }

            /**
             * Writes a <code>w</code> element carrying the stored attributes of the current word.
             *
             * @param word the text content to write
             */
            protected void writeWord(String word) {
                writer.writeStartElement("w")
                for (String attr : wInfos.keySet()) {
                    writer.writeAttribute(attr, wInfos[attr])
                }
                writer.writeCharacters(word)
                writer.writeEndElement() // w
            }

            // a '*' directly followed by a non-space character opens an 'other' passage
            def startOtherReg = /^(.*)\*([^ ]+.*)$/
            // a '*' directly preceded by a non-space character closes an 'other' passage
            def endOtherReg = /^(.*[^ ]+)\*(.*)$/
            // word and location of the last opened / last closed 'other' Turn, for the warnings
            String previousOtherStarting = "<none>"
            String previousOtherClosing = "<none>"

            /**
             * Writes the buffered word, opening and/or closing an 'other' Turn around it
             * when the word carries a <code>*</code> marker. Elements other than
             * <code>w</code> get the default processing.
             */
            @Override
            protected void processEndElement() throws XMLStreamException {
                if (!localname.equals("w")) {
                    super.processEndElement()
                    return
                }

                inW = false
                String word = wordBuffer.toString().trim()
                def m1 = word =~ startOtherReg

                if (m1.matches()) {
                    if (debug) println "OPEN OTHER at $word"
                    if (other) {
                        println "Warning: found a starting * when one 'other' is already started at "+getLocation()
                        println "Previous starting 'other' at "+previousOtherStarting
                    } else {
                        // close current Turn and start a 'other' Turn
                        previousOtherStarting = "word="+word+ " location="+getLocation()
                        String group1 = m1.group(1)
                        if (group1.length() > 0) {
                            // text before the '*' still belongs to the current Turn
                            writeWord(group1)
                            writer.writeCharacters("\n")
                        }

                        writer.writeEndElement() // current Turn
                        writer.writeCharacters("\n")

                        def tmpInfos = new LinkedHashMap()
                        for (String attr : turnInfos.keySet()) tmpInfos[attr] = turnInfos[attr]
                        tmpInfos["orig-speaker"] = turnInfos["speaker"]

                        if (primarySpeakerIdRegex == null || turnInfos["speaker"] ==~ primarySpeakerIdRegex) {
                            // no usable regex, or the current speaker IS the primary speaker:
                            // the marked passage is attributed to the 'other' speaker
                            tmpInfos["speaker"] = otherNonPrimarySpeakerId
                        } else {
                            // the current speaker is not the primary speaker:
                            // the marked passage is attributed to the primary speaker
                            tmpInfos["speaker"] = primarySpeakerId
                        }
                        tmpInfos["startTime"] = currentTime
                        writer.writeStartElement("Turn")
                        for (String attr : tmpInfos.keySet()) {
                            writer.writeAttribute(attr, tmpInfos[attr])
                        }
                        writer.writeCharacters("\n")

                        other = true
                        word = m1.group(2) // the rest of the word goes in the new Turn
                    }
                }

                boolean shouldCloseOtherTurn = false
                def m2 = word =~ endOtherReg
                if (m2.matches()) {
                    if (debug) println "DETECT END OTHER at $word"
                    if (other) {
                        shouldCloseOtherTurn = true
                        // remember this closing so that a later stray '*' can report it
                        previousOtherClosing = "word="+word+ " location="+getLocation()

                        word = m2.group(1)
                        other = false
                    } else {
                        println "Warning: found a ending * when one 'other' is not started at "+getLocation()
                        println "Previous closing 'other' Turn at "+previousOtherClosing
                    }
                }

//                if ("XXX".equals(word)) { // <Event desc="XXX" type="unknown" extent="next"/>
//                    writer.writeStartElement("event") // start the initial word
//                    writer.writeAttribute("desc", "XXX from "+wInfos["start"] + " to "+wInfos["end"])
//                    writer.writeAttribute("type", "unknown")
//                    writer.writeAttribute("extent", "instantaneous")
//                    writer.writeEndElement() // event
//                    word = "" // don't write the word
//                }

                if (word.length() > 0) {
                    writeWord(word)
                }

                if (shouldCloseOtherTurn) {
                    if (debug) println "CLOSE OTHER at $word"
                    // close the current 'other' Turn and restart the actual Turn
                    writer.writeCharacters("\n")
                    writer.writeEndElement() // current 'other' Turn
                    writer.writeCharacters("\n")

                    writer.writeStartElement("Turn") // rebuild the orig Turn and fix its start-end infos
                    // NOTE(review): assumes every w carries an 'end' attribute -- TODO confirm
                    turnInfos["startTime"] = wInfos["end"] // fix the startTime using the current word end time
                    for (String attr : turnInfos.keySet()) {
                        writer.writeAttribute(attr, turnInfos[attr])
                    }
                    writer.writeCharacters("\n")

                    // text after the closing '*' belongs to the restarted Turn
                    if (m2.group(2).length() > 0) {
                        writeWord(m2.group(2))
                    }
                }
            }
        }
    }

    /**
     * Manual test entry point: processes a hard-coded test file with no primary
     * speaker regex and prints the result of the processing.
     */
    public static void main(String[] args) {
        File infile = new File("/home/mdecorde/xml/vocapia","test.trs")
        File outfile = new File("/home/mdecorde/xml/vocapia","test-fixed.trs")
        String primarySpeakerIdRegexString = "" // empty: no primary speaker detection
        def processor = new CreateTheOtherTurns(infile, primarySpeakerIdRegexString, "other", true)
        println processor.process(outfile)
    }
}