Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / macro / transcription / TXT2TRS.groovy @ 187

History | View | Annotate | Download (10.9 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
21

    
22
package org.txm.macro.transcription
23

    
24
import java.text.DecimalFormat
25
import java.text.SimpleDateFormat
26

    
27
import javax.xml.stream.*
28

    
29
import org.txm.doc.*
30
import org.txm.importer.*
31
import org.txm.importer.filters.*
32

    
33
/**
 * Converts a plain-text transcription into a TRS-style XML file
 * (Trans / Speakers / Episode / Section / Turn / Sync / Comment elements).
 *
 * Input conventions, as recognised by the code below:
 * <ul>
 * <li>everything before the first stand-alone time bullet line is ignored;</li>
 * <li>a line containing a tab after its first character starts a Turn:
 *     speaker, tab, speech;</li>
 * <li>a line made only of parenthesised groups is either a time bullet
 *     (Sync) or a comment;</li>
 * <li>a line enclosed in square brackets is a section header carrying
 *     name="value" attributes;</li>
 * <li>any other line continues the current group.</li>
 * </ul>
 */
class TXT2TRS {

    // NOTE(review): several of the properties below are never read by this
    // class; they are kept as declared because Groovy exposes them as public
    // properties and other code may rely on them.
    boolean inTurn = false
    def locs = new HashSet<String>()
    /** Only interpolated into error messages by this class. */
    def lastTiming = "none"
    String localname = null
    String allTxt
    String header = ""
    String type = ""
    boolean startOfP = false
    boolean firstTurn = true
    boolean firstSync = true
    def loc = null
    int time = 0;
    boolean start= false;
    boolean startTranscription=false
    /** True while a Section element is open in the output document. */
    boolean isThemeOpened = false

    /** Initialised by process(); no longer used for parsing (see bulletToSeconds). */
    def formater
    /** Initialised by process(); not used by this class. */
    def formater2
    /** Output writer, opened and closed by process(). */
    StaxStackWriter pagedWriter

    /** A time bullet: "(h:mm:ss)" with an optional one-digit fraction, or "¤<millis>". */
    def bullet_regex = /\([0-9]+:[0-9][0-9]?:[0-9][0-9]?(\.[0-9])?\)|(¤<[0-9]+>)/
    /** A line that ends with a time bullet. */
    def turn_with_bullet = /.+(\([0-9]+:[0-9][0-9]?:[0-9][0-9]?(\.[0-9])?\)|(¤<[0-9]+>))/
    /** One name="value" attribute inside a section header line. */
    def section_regex = /([a-zA-Z]+)="([^"]+)"/

    public TXT2TRS() { }

    /**
     * Reads txtFile (UTF-8) and writes the corresponding TRS document to trsFile.
     *
     * The input is validated before the output file is opened, so a rejected
     * input no longer leaves a half-written TRS file and an unclosed writer
     * behind. The writer is always closed, even if writing fails.
     *
     * @param txtFile the text transcription to convert
     * @param trsFile the TRS file to create
     * @return false if the start or end time bullet is missing, true otherwise
     */
    public boolean process(File txtFile, File trsFile) {

        // Kept for backward compatibility of the public properties.
        formater = new SimpleDateFormat("h:mm:ss.S");
        formater.setTimeZone(TimeZone.getTimeZone("GMT"));
        formater2 = new DecimalFormat("#######.0")
        formater2.setMaximumFractionDigits(2)

        def trslines = collectTranscriptionLines(txtFile.readLines("UTF-8"))
        if (trslines.isEmpty()) {
            println "The $txtFile file does not contain the first time bullet"
            return false;
        }
        if (!hasEndBullet(trslines)) {
            println "Error: The $txtFile file last line or last turn does not contain the end time bullet: $bullet_regex"
            return false;
        }

        def groups = groupLines(trslines)
        String currentTime = trslines[0] // starting time

        isThemeOpened = false
        pagedWriter = new StaxStackWriter(trsFile, "UTF-8");
        try {
            pagedWriter.writeStartDocument("UTF-8", "1.0")
            // No DTD declaration is written: the parser has the "no DTD" option set.
            pagedWriter.writeStartElement("Trans");
            pagedWriter.writeAttribute("scribe", "SCRIBE");
            pagedWriter.writeAttribute("audio_filename", "");
            pagedWriter.writeAttribute("version", "1.0");
            pagedWriter.writeAttribute("version_date", "110110");
            pagedWriter.writeStartElement("Speakers");
            pagedWriter.writeEndElement(); // Speakers

            pagedWriter.writeStartElement("Episode");
            pagedWriter.writeAttribute("program", "NA");
            pagedWriter.writeAttribute("air_date", "NA");

            for (def g : groups) {
                if (g[0] == "turn") {
                    currentTime = writeTurn(g, currentTime)
                } else if (g[0] == "sync") {
                    currentTime = g[1]
                    writeSync(currentTime)
                } else if (g[0] == "comment") {
                    writeComment(g[1])
                } else if (g[0] == "section") {
                    writeSection(g[1])
                } else {
                    println "Error: found malformed line: "+g
                }
            }

            if (isThemeOpened) { // close the last Section
                pagedWriter.writeCharacters("\n")
                pagedWriter.writeEndElement(); // Section
            }
            pagedWriter.writeCharacters("\n")
            pagedWriter.writeEndElement(); // Episode
            pagedWriter.writeCharacters("\n")
            pagedWriter.writeEndElement(); // Trans
        } finally {
            pagedWriter.close()
        }
        return true
    }

    /**
     * Returns the trimmed, non-empty lines starting at the first stand-alone
     * time bullet line; empty if no such line exists.
     */
    private List<String> collectTranscriptionLines(List<String> lines) {
        def kept = []
        boolean started = false
        for (String raw : lines) {
            String line = raw.trim()
            if (line.matches("\\(.+:..:..\\..+\\)")) started = true
            if (started && line.length() > 0) kept << line
        }
        return kept
    }

    /**
     * Tells whether the transcription is terminated by a time bullet: either
     * the last line is a bullet, or the first line of the last turn ends with one.
     */
    private boolean hasEndBullet(List<String> trslines) {
        if (trslines[-1].matches("\\(.+:..:..\\..+\\)")) return true
        // Walk back to the last turn header (index 0 is the start bullet).
        for (int i = trslines.size() - 1 ; i > 0 ; i--) {
            String line = trslines[i].trim()
            if (line.split("\t", 2).length == 2) {
                return line.matches(turn_with_bullet)
            }
        }
        return false
    }

    /**
     * Groups lines into ["sync", bullet], ["turn", speaker, speech...],
     * ["comment", text...] and ["section", text] lists, in document order.
     * A group whose first element is none of these keywords is a malformed
     * line and is reported by process().
     */
    private List groupLines(List<String> lines) {
        def groups = [["sync", lines[0]]]
        def currentGroup = []

        for (String line : lines.subList(1, lines.size())) {
            if (line.indexOf("\t") > 0) { // start of Turn
                def split = line.split("\t", 2)
                if (currentGroup.size() > 0) groups << currentGroup
                currentGroup = ["turn", split[0], split[1]];
            } else if (line.matches("(\\([^)]+\\) *)+")) { // comment or timing
                if (line.matches(bullet_regex)) {
                    if (currentGroup.size() > 0 && currentGroup[0] == "turn") {
                        currentGroup << line // Sync inside the current Turn
                    } else {
                        // Stand-alone Sync: close whatever precedes it so the
                        // bullet is not swallowed by a comment group.
                        if (currentGroup.size() > 0) groups << currentGroup
                        currentGroup = []
                        groups << ["sync", line]
                    }
                } else { // stand-alone comment, ends the current group
                    if (currentGroup.size() > 0) groups << currentGroup
                    currentGroup = ["comment", line];
                }
            } else if (line.startsWith("[") && line.endsWith("]")) { // section, end of Turn
                if (currentGroup.size() > 0) groups << currentGroup
                groups << ["section", line]
                // Reset, otherwise the stored group is stored a second time
                // when the next group starts.
                currentGroup = []
            } else { // something that happens during the current group
                currentGroup << line
            }
        }
        // Flush the trailing group: without this the last turn is lost.
        if (currentGroup.size() > 0) groups << currentGroup
        return groups
    }

    /**
     * Splits txt around every match of regex, keeping the matches themselves
     * as separate items and dropping empty in-between pieces.
     */
    private List<String> splitKeepingMatches(String txt, def regex) {
        def parts = []
        def m = (txt =~ regex)
        int last = 0
        while (m.find()) {
            if (m.start() > last) parts << txt.substring(last, m.start())
            parts << m.group()
            last = m.end()
        }
        if (last < txt.length()) parts << txt.substring(last)
        if (parts.isEmpty()) parts << txt
        return parts
    }

    /**
     * Writes one Turn element: a Sync at currentTime, then the speech with
     * its inline time bullets (Sync) and parenthesised comments (Comment).
     * A default Section is opened first if none is open yet.
     *
     * @return the time bullet in effect after the turn
     */
    private String writeTurn(List g, String currentTime) {
        String speaker = g[1]

        // Tokenize time bullets first, then comments; speech starts at position 2.
        def tokens = []
        for (int i = 2 ; i < g.size() ; i++) {
            for (String piece : splitKeepingMatches(g[i], bullet_regex)) {
                tokens.addAll(splitKeepingMatches(piece, /(\([^)]+\))/))
            }
        }

        if (!isThemeOpened) {
            isThemeOpened = true;
            pagedWriter.writeStartElement("Section");
            pagedWriter.writeAttribute("type", "report");
            pagedWriter.writeAttribute("topic", "");
            pagedWriter.writeAttribute("startTime", "0.0");
            pagedWriter.writeAttribute("endTime", "");
        }

        pagedWriter.writeCharacters("\n")
        pagedWriter.writeStartElement("Turn");
        pagedWriter.writeAttribute("speaker", speaker);
        pagedWriter.writeAttribute("startTime", "");
        pagedWriter.writeAttribute("endTime", "");
        pagedWriter.writeCharacters("\n")
        writeSync(currentTime)

        for (String t : tokens) {
            if (t.matches(bullet_regex)) {
                writeSync(t)
                currentTime = t
            } else if (t.startsWith("(") && t.endsWith(")")) {
                writeComment(t)
            } else {
                pagedWriter.writeCharacters(" "+t)
            }
        }

        pagedWriter.writeCharacters("\n")
        pagedWriter.writeEndElement(); // Turn
        return currentTime
    }

    /** Writes an empty Comment element whose desc is txt without its outer parentheses. */
    private void writeComment(String txt) {
        pagedWriter.writeCharacters("\n")
        pagedWriter.writeEmptyElement("Comment")
        pagedWriter.writeAttribute("desc", txt.substring(1, txt.length() - 1))
    }

    /**
     * Closes the open Section, if any, and opens a new one whose attributes
     * are the defaults overridden by the name="value" pairs found in txt.
     * A header without any such pair is reported and skipped.
     */
    private void writeSection(String txt) {
        def attr = ["type":"report", "topic":"", "startTime":"", "endTime":""]
        int found = 0
        def m = (txt =~ section_regex)
        while (m.find()) {
            attr[m.group(1)] = m.group(2)
            found++
        }
        if (found == 0) {
            println "ERROR $lastTiming malformed section "+txt
            return
        }

        if (isThemeOpened) {
            pagedWriter.writeCharacters("\n")
            pagedWriter.writeEndElement(); // previous Section
        }
        isThemeOpened = true;
        pagedWriter.writeCharacters("\n")
        pagedWriter.writeStartElement("Section");
        for (def k : attr.keySet()) pagedWriter.writeAttribute(k, attr[k]);
    }

    /**
     * Writes an empty Sync element whose "time" attribute is the time bullet
     * txt converted to seconds, followed by a newline.
     *
     * @param txt a time bullet, "(h:mm:ss.f)" or "¤<millis>"
     */
    def writeSync(String txt) {
        pagedWriter.writeEmptyElement("Sync");
        pagedWriter.writeAttribute("time", bulletToSeconds(txt));
        pagedWriter.writeCharacters("\n")
    }

    /**
     * Converts a time bullet to a decimal number of seconds.
     *
     * "¤<5648>" gives "5.648"; values shorter than four digits are zero-padded
     * ("¤<56>" gives "0.056") instead of failing. "(h:mm:ss.f)" is parsed by
     * hand so that the fraction is a decimal fraction of a second
     * ("(0:00:01.5)" gives "1.5"), hours are not limited to a 12-hour clock,
     * and the fraction may be omitted, as bullet_regex allows.
     *
     * @throws java.text.ParseException if a parenthesised bullet is not a valid time
     */
    private String bulletToSeconds(String txt) {
        if (txt.startsWith("¤")) { // ¤<5648>
            String millis = txt.substring(2, txt.length() - 1)
            while (millis.length() < 4) millis = "0" + millis
            int cut = millis.length() - 3
            return millis.substring(0, cut) + "." + millis.substring(cut)
        }

        String str = txt.substring(1, txt.length() - 1)
        def m = (str =~ /([0-9]+):([0-9]{1,2}):([0-9]{1,2})(?:\.([0-9]+))?/)
        if (!m.matches()) {
            throw new java.text.ParseException("Unparseable time bullet: \"" + txt + "\"", 0)
        }
        long hours = Long.parseLong(m.group(1))
        long minutes = Long.parseLong(m.group(2))
        long seconds = Long.parseLong(m.group(3))
        // Scale the fraction to milliseconds: "5" -> 500, "25" -> 250.
        String fraction = m.group(4) == null ? "" : m.group(4)
        long millis = Long.parseLong((fraction + "000").substring(0, 3))
        long total = ((hours * 60 + minutes) * 60 + seconds) * 1000 + millis
        return "" + (total / 1000.0d)
    }
}