Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / transcription / Vocapia2Transcriber.groovy @ 3038

History | View | Annotate | Download (8.4 kB)

1
package org.txm.macro.transcription
2

    
3
import javax.xml.stream.*
4

    
5
import org.txm.importer.PersonalNamespaceContext
6

    
7
import java.io.BufferedOutputStream
8
import java.io.FileOutputStream
9
import java.net.URL
10
import java.util.regex.Pattern
11

    
12
/**
 * Reads a Vocapia XML transcription with a StAX parser and rewrites it as an
 * XML file made of Trans / Speakers / Episode / Section / Turn / Sync / w
 * elements (presumably the Transcriber ".trs" layout, judging by the class
 * name and the sample output file name used in main()).
 *
 * Element mapping performed by process():
 *   AudioDoc      -> Trans (audio_filename = path)
 *   Proc[scribe]  -> extra attributes on Trans (scribe, version, version_date)
 *   SpeakerList   -> Speakers
 *   Speaker       -> Speaker (id and name = spkid, dialect = lang, accent = gender)
 *   Channel       -> only its "sigdur" is kept, used as the Section endTime
 *   SegmentList   -> Episode + Section
 *   SpeechSegment -> Turn followed by an empty Sync
 *   Word          -> w (written when the Word element ends)
 */
class Vocapia2Transcriber {

    /** Vocapia XML file to convert. */
    File xmlfile;

    /** Byte stream of the result file; opened and closed by process(). */
    protected BufferedOutputStream output;

    /** StAX writer layered on top of output; opened and closed by process(). */
    protected XMLStreamWriter writer;

    /**
     * @param xmlfile the Vocapia XML file to convert
     */
    public Vocapia2Transcriber(File xmlfile) {
        this.xmlfile = xmlfile;
    }

    /**
     * Converts the input file and writes the result to outfile (ISO-8859-1).
     *
     * Errors raised while reading or converting the document are printed and
     * reported through the return value; a partial result file is left in place
     * for inspection. Errors raised while opening the files propagate to the
     * caller, as before.
     *
     * @param outfile the file to write
     * @return true if the whole document was converted and the result file was
     *         closed properly; false if the input file does not exist or the
     *         conversion failed
     */
    public boolean process(File outfile) {

        if (!xmlfile.exists()) return false;

        // forget streams of a previous call so the cleanup below never touches them
        output = null
        writer = null

        InputStream inputData = null
        XMLStreamReader parser = null
        boolean success = false

        try {
            output = new BufferedOutputStream(new FileOutputStream(outfile), 16 * 1024);
            writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "ISO-8859-1");// create a new file
            writer.setNamespaceContext(new PersonalNamespaceContext());

            inputData = xmlfile.toURI().toURL().openStream();
            parser = XMLInputFactory.newInstance().createXMLStreamReader(inputData);

            boolean flagWord = false // true while inside a <Word> element
            Map<String, String> winfos = new LinkedHashMap<String, String>() // attributes of the pending <w>
            String word = "" // text accumulated for the current <Word>
            String duration = "0.0" // last Channel@sigdur seen, used as Section@endTime

            try {
                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {

                    if (event == XMLStreamConstants.START_ELEMENT) {
                        switch (parser.getLocalName()) {
                            case "AudioDoc": // <AudioDoc name="xyz" path="xyz.flac"> -> <Trans scribe="see Proc elements" audio_filename="xyz.flac" version="see Proc elements" version_date="see Proc elements">
                                writer.writeStartDocument("ISO-8859-1", "1.0")
                                writer.writeStartElement("Trans")
                                writer.writeAttribute("audio_filename", parser.getAttributeValue(null, "path"))
                                break

                            case "Proc": // <Proc name="scribe" version="date" editor="AAA"/>
                                // continue writing the "Trans" element: its start tag is still open
                                if ("scribe" == parser.getAttributeValue(null, "name")) {
                                    writer.writeAttribute("scribe", parser.getAttributeValue(null, "editor"))
                                    writer.writeAttribute("version", parser.getAttributeValue(null, "version"))
                                    // NOTE(review): version_date is filled from the "version" attribute, like "version" above - confirm this is intended
                                    writer.writeAttribute("version_date", parser.getAttributeValue(null, "version"))
                                }
                                break

                            case "SpeakerList": // <SpeakerList> -> <Speakers>
                                writer.writeCharacters("\n") // after <Trans>
                                writer.writeStartElement("Speakers")
                                writer.writeCharacters("\n")
                                break

                            case "Speaker": // <Speaker ch="1" dur="531.38" gender="X" spkid="Enquêtrice" lang="fre" lconf="1.00" nw="1586" tconf="0.95"/> -> <Speaker id="spk1" name="enq4" check="no" dialect="native" accent="" scope="local"/>
                                writer.writeStartElement("Speaker")
                                writer.writeAttribute("id", parser.getAttributeValue(null, "spkid"))
                                writer.writeAttribute("name", parser.getAttributeValue(null, "spkid"))
                                writer.writeAttribute("check", "")
                                writer.writeAttribute("dialect", parser.getAttributeValue(null, "lang"))
                                writer.writeAttribute("accent", parser.getAttributeValue(null, "gender"))
                                writer.writeAttribute("scope", "local")
                                writer.writeEndElement() // closed here, nothing to do at the end tag
                                writer.writeCharacters("\n")
                                break

                            case "Channel":
                                duration = parser.getAttributeValue(null, "sigdur")
                                break

                            case "SegmentList":
                                writer.writeStartElement("Episode")
                                //<Section type="report" startTime="0" endTime="3617.593">
                                writer.writeStartElement("Section")
                                writer.writeAttribute("startTime", "0.0")
                                writer.writeAttribute("endTime", duration)
                                // the SegmentList attributes are copied as they are onto the Section
                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                    writer.writeAttribute(parser.getAttributeLocalName(i), parser.getAttributeValue(i))
                                }
                                break

                            case "SpeechSegment": // <SpeechSegment ch="1" sconf="1.00" stime="9.94" etime="43.81" spkid="Enquêtrice" lang="fre" lconf="1.00" trs="1">
                                writer.writeStartElement("Turn")
                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                    String name = parser.getAttributeLocalName(i)
                                    if (name == "stime") name = "startTime"
                                    else if (name == "etime") name = "endTime"
                                    else if (name == "spkid") name = "speaker"

                                    writer.writeAttribute(name, parser.getAttributeValue(i))
                                }

                                writer.writeCharacters("\n")
                                writer.writeStartElement("Sync")
                                writer.writeAttribute("time", parser.getAttributeValue(null, "stime"))
                                writer.writeEndElement() // Sync
                                writer.writeCharacters("\n")
                                break

                            case "Word":
                                flagWord = true
                                word = ""

                                // end = stime + dur, formatted with 2 decimals and a '.' separator
                                String endValue = String.format(Locale.US, "%.2f", (Double.parseDouble(parser.getAttributeValue(null, "stime")) + Double.parseDouble(parser.getAttributeValue(null, "dur"))))
                                String startValue = parser.getAttributeValue(null, "stime");

                                // keep every attribute but "stime", which is stored as "time" and "start"
                                winfos.clear()
                                for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                    String name = parser.getAttributeLocalName(i)
                                    if (name == "stime") continue

                                    winfos[name] = parser.getAttributeValue(i)
                                }
                                winfos["time"] = startValue
                                winfos["start"] = startValue
                                winfos["end"] = endValue

                                // <w> is written at the end of the <Word> tag, once its text is known
                                break
                        }
                    } else if (event == XMLStreamConstants.END_ELEMENT) {
                        switch (parser.getLocalName()) {
                            case "AudioDoc":
                                writer.writeEndElement() // Trans
                                break

                            case "SpeakerList":
                                writer.writeEndElement() // Speakers
                                writer.writeCharacters("\n")
                                break

                            case "SegmentList":
                                writer.writeEndElement() // Section
                                writer.writeEndElement() // Episode
                                writer.writeCharacters("\n")
                                break

                            case "SpeechSegment":
                                writer.writeEndElement() // Turn
                                writer.writeCharacters("\n")
                                break

                            case "Word":
                                flagWord = false
                                word = word.trim()

                                // words left empty after trimming are not written
                                if (word.length() > 0) {
                                    writer.writeStartElement("w")
                                    for (Map.Entry<String, String> info : winfos.entrySet()) {
                                        writer.writeAttribute(info.getKey(), info.getValue())
                                    }
                                    writer.writeCharacters(word)
                                    writer.writeEndElement() // w
                                    writer.writeCharacters("\n")
                                }
                                break
                        }
                    } else if (event == XMLStreamConstants.CHARACTERS) {
                        if (flagWord) {
                            word += parser.getText()
                        }
                    }
                }
                success = true
            } catch (Exception e) {
                println "Error while processing XML File "+xmlfile+": "
                e.printStackTrace();
                println "At: "+parser.getLocation();
                println "See: "+outfile.getAbsolutePath()
            }
        } finally {
            // Each resource is released on its own so that one failing close() does not leak the others.
            // The writer goes first: its pending content must reach 'output' before 'output' is closed.
            if (writer != null) {
                try {
                    writer.flush()
                    writer.close()
                } catch (Exception e) {
                    success = false // the result file may be truncated
                    println "Error while closing the XML writer of "+outfile+": "+e
                }
            }
            if (output != null) {
                try {
                    output.close()
                } catch (IOException e) {
                    success = false // the result file may be truncated
                    println "Error while closing "+outfile+": "+e
                }
            }
            if (parser != null) {
                try {
                    parser.close()
                } catch (XMLStreamException ignored) {
                    // read side only: the result file is not affected
                }
            }
            // closing the StAX reader does not close the stream it reads from
            if (inputData != null) {
                try {
                    inputData.close()
                } catch (IOException ignored) {
                    // read side only: the result file is not affected
                }
            }
        }

        //println "$xmlfile -> $outfile"
        return success
    }

    /**
     * Manual test entry point.
     *
     * @param args optional: args[0] = input Vocapia XML file, args[1] = output file;
     *             the developer's sample files are used for missing arguments
     */
    public static void main(String[] args) {
        boolean hasInput = args != null && args.length > 0
        boolean hasOutput = args != null && args.length > 1
        File infile = hasInput ? new File(args[0]) : new File("/home/mdecorde/xml/vocapia","test.xml")
        File outfile = hasOutput ? new File(args[1]) : new File("/home/mdecorde/xml/vocapia","test.trs")
        def processor = new Vocapia2Transcriber(infile)
        println processor.process(outfile)
    }
}