Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / transcription / TXT2TRS.groovy @ 479

History | View | Annotate | Download (12.1 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21
package org.txm.macro.transcription
22

    
23
import org.txm.utils.xml.DomUtils;
24
import org.w3c.tidy.Tidy
25
import org.txm.doc.*
26

    
27
import javax.xml.stream.*
28

    
29
import java.util.HashMap
30
import java.util.List
31

    
32
import org.txm.importer.*
33
import org.xml.sax.Attributes
34
import org.txm.importer.filters.*
35
import org.txm.utils.io.IOUtils;
36
import org.txm.utils.i18n.DetectBOM
37

    
38
import java.io.File
39
import java.io.IOException
40
import java.util.ArrayList
41

    
42
import javax.xml.parsers.SAXParserFactory
43
import javax.xml.parsers.ParserConfigurationException
44
import javax.xml.parsers.SAXParser
45

    
46
import java.net.URL
47

    
48
import org.xml.sax.InputSource
49
import org.xml.sax.helpers.DefaultHandler
50

    
51
import java.text.DecimalFormat
52
import java.text.ParseException
53
import java.text.SimpleDateFormat
54
import java.util.Date;
55

    
56
/**
 * Parse a formatted TXT file to create a TRS file.
 *
 * Manage turns, syncs, locutors, sections and comments.
 *
 * The parsing of the TXT file starts at the first time code: every line
 * found before the first line made only of a time bullet is ignored.
 *
 * Line shapes recognized once the lines are trimmed:
 * <ul>
 * <li>a line containing a tabulation after its first character starts a
 * Turn: "locutor TAB speech"</li>
 * <li>a line made only of "(...)" blocks is a time bullet if it matches
 * {@code bullet_regex}, a comment otherwise</li>
 * <li>a line starting with "[" and ending with "]" is a Section</li>
 * <li>any other line is appended to the current group</li>
 * </ul>
 *
 * @author mdecorde, sheiden
 */
class TXT2TRS {

    // NOTE(review): most of the following state fields are not read by the
    // code of this class; they are kept because they are public properties.
    boolean inTurn = false
    def locs = new HashSet<String>()
    def lastTiming = "none"
    String localname = null
    String allTxt
    String header = ""
    String type = ""
    boolean startOfP = false
    boolean firstTurn = true
    boolean firstSync = true
    def loc = null
    int time = 0;
    boolean start = false;
    boolean startTranscription = false
    /** true while a Section element is opened in the output */
    boolean isThemeOpened = false

    // Formatters are still initialized by process() for compatibility, but
    // time bullets are no longer parsed with them (see writeSync()).
    def formater
    def formater_without_ms
    def formater2
    /** XML writer of the TRS file, opened and closed by process() */
    StaxStackWriter pagedWriter

    /** a time bullet: "(h:mm:ss)", "(h:mm:ss.f)", "(h:mm:ss.ff)" or "¤<milliseconds>" */
    def bullet_regex = /\([0-9]+:[0-9][0-9]?:[0-9][0-9]?(\.[0-9][0-9]?)?\)|(¤<[0-9]+>)/
    /** a line ending with a time bullet; accepts the same bullets as bullet_regex */
    def turn_with_bullet = /.+(\([0-9]+:[0-9][0-9]?:[0-9][0-9]?(\.[0-9][0-9]?)?\)|(¤<[0-9]+>))/
    /** one name="value" attribute of a Section line */
    def section_regex = /([a-zA-Z]+)="([^"]+)"/

    public TXT2TRS() { }

    /**
     * Converts a TXT transcription into a TRS file.
     *
     * The input is validated before the output file is opened, so nothing
     * is written when the TXT file is rejected. The writer is always closed,
     * even if an exception is thrown while writing.
     *
     * @param txtFile the TXT transcription, read as UTF-8 (a BOM is skipped)
     * @param trsFile the TRS file to write
     * @return false if the TXT file contains no starting time bullet or no
     *         ending time bullet, true once the TRS file is written
     */
    public boolean process(File txtFile, File trsFile) {
        def trslines = readTranscriptionLines(txtFile)
        if (trslines.isEmpty()) {
            println "The $txtFile file does not contain the first time bullet"
            return false;
        }
        if (!hasEndBullet(trslines)) {
            println "Error: The $txtFile file last line or last turn does not contain the end time bullet: $bullet_regex"
            return false;
        }

        formater = new SimpleDateFormat("h:mm:ss.S");
        formater_without_ms = new SimpleDateFormat("h:mm:ss");
        formater.setTimeZone(TimeZone.getTimeZone("GMT"));
        formater_without_ms.setTimeZone(TimeZone.getTimeZone("GMT"));
        formater2 = new DecimalFormat("#######.0")
        formater2.setMaximumFractionDigits(2)

        String currentTime = trslines[0] // starting time
        def groups = groupLines(trslines)

        isThemeOpened = false
        boolean waitingFirstTurn = true

        pagedWriter = new StaxStackWriter(trsFile, "UTF-8");
        try {
            pagedWriter.writeStartDocument("UTF-8", "1.0")
            // no DOCTYPE: the parser has the "no DTD" option set
            pagedWriter.writeStartElement("Trans");
            pagedWriter.writeAttribute("scribe", "SCRIBE");
            pagedWriter.writeAttribute("audio_filename", "");
            pagedWriter.writeAttribute("version", "1.0");
            pagedWriter.writeAttribute("version_date", "110110");
            pagedWriter.writeStartElement("Speakers");
            pagedWriter.writeEndElement(); // Speakers

            pagedWriter.writeStartElement("Episode");
            pagedWriter.writeAttribute("program", "NA");
            pagedWriter.writeAttribute("air_date", "NA");

            for (def g : groups) {
                if (g[0] == "turn") {
                    if (waitingFirstTurn && !isThemeOpened) {
                        // no Section line before the first Turn: open a default Section
                        isThemeOpened = true;
                        pagedWriter.writeStartElement("Section");
                        pagedWriter.writeAttribute("type", "report");
                        pagedWriter.writeAttribute("topic", "");
                        pagedWriter.writeAttribute("startTime", "0.0");
                        pagedWriter.writeAttribute("endTime", "");
                    }
                    waitingFirstTurn = false
                    currentTime = writeTurn(g, currentTime)
                } else if (g[0] == "sync") {
                    currentTime = g[1]
                    writeSync(currentTime)
                } else if (g[0] == "comment") {
                    pagedWriter.writeCharacters("\n")
                    pagedWriter.writeEmptyElement("Comment")
                    pagedWriter.writeAttribute("desc", g[1].substring(1, g[1].length() - 1))
                } else if (g[0] == "section") {
                    writeSection(g[1])
                } else {
                    println "Error: found malformed line: "+g
                }
            }

            if (isThemeOpened) { // close previous theme
                pagedWriter.writeCharacters("\n")
                pagedWriter.writeEndElement(); // Section
            }
            pagedWriter.writeCharacters("\n")
            pagedWriter.writeEndElement(); // Episode
            pagedWriter.writeCharacters("\n")
            pagedWriter.writeEndElement(); // Trans
        } finally {
            pagedWriter.close()
        }
        return true
    }

    /**
     * Reads the TXT file and returns its trimmed, non-empty lines starting
     * at the first line made only of a time bullet.
     *
     * @return the transcription lines, empty if no starting bullet was found
     */
    private List<String> readTranscriptionLines(File txtFile) {
        int bomSize = new DetectBOM(txtFile).getBOMSize()
        FileInputStream input = new FileInputStream(txtFile);
        def lines
        try {
            // skip the BOM bytes before any character is decoded
            for (int i = 0 ; i < bomSize ; i++) input.read();
            lines = new InputStreamReader(input, "UTF-8").readLines();
        } finally {
            input.close()
        }

        def trslines = []
        boolean started = false
        for (String raw : lines) {
            String line = raw.trim()
            if (!started && line.matches(bullet_regex)) started = true
            if (started && line.length() > 0) trslines << line
        }
        return trslines
    }

    /**
     * Tests if the transcription ends with a time bullet: either the last
     * line is a bullet, or the first line of the last Turn ends with one.
     */
    private boolean hasEndBullet(List<String> trslines) {
        if (trslines[-1].matches(bullet_regex)) return true

        // the first line is the starting bullet, it cannot be a Turn
        for (int i = trslines.size() - 1 ; i > 0 ; i--) {
            String line = trslines[i]
            if (line.split("\t", 2).length == 2) { // the last Turn
                return line.matches(turn_with_bullet)
            }
        }
        return false
    }

    /**
     * Groups the transcription lines per Turn, Sync, Comment and Section.
     *
     * Each group is a list whose first item is its kind: ["sync", bullet],
     * ["turn", locutor, speech, more lines...], ["comment", text, more
     * lines...] or ["section", line].
     *
     * @param lines the transcription lines, the first one is the starting bullet
     */
    private List groupLines(List<String> lines) {
        def groups = [["sync", lines[0]]]
        def currentGroup = []

        for (String line : lines.subList(1, lines.size())) {
            if (line.indexOf("\t") > 0) { // start of Turn
                def split = line.split("\t", 2)
                if (currentGroup.size() > 0) groups << currentGroup // store previous group
                currentGroup = ["turn", split[0], split[1]];
            } else if (line.matches("(\\([^)]+\\) *)+")) { // comment or timing
                if (line.matches(bullet_regex)) {
                    if (currentGroup.size() > 0) {
                        currentGroup << line // Sync in a Turn
                    } else {
                        groups << ["sync", line]
                    }
                } else { // comment
                    if (currentGroup.size() > 0) groups << currentGroup // store previous group
                    currentGroup = ["comment", line];
                }
            } else if (line.startsWith("[") && line.endsWith("]")) { // section, end of Turn
                if (currentGroup.size() > 0) groups << currentGroup
                groups << ["section", line]
                // start a new group, otherwise the stored one would be stored
                // a second time and would receive the following lines
                currentGroup = []
            } else { // something that happens during a Turn
                currentGroup << line
            }
        }
        // the last group is not followed by a line that would store it
        if (currentGroup.size() > 0) groups << currentGroup
        return groups
    }

    /**
     * Splits a text on the matches of a regex, keeping the matches.
     *
     * @return the non-empty texts found between the matches and the matches,
     *         in their order of appearance; the text itself if nothing matches
     */
    private static List<String> splitKeepingMatches(String txt, def regex) {
        def parts = []
        def m = (txt =~ regex)
        int lastMatch = 0
        boolean nomatch = true
        while (m.find()) {
            nomatch = false
            if (m.start() > lastMatch) parts << txt.substring(lastMatch, m.start())
            parts << m.group()
            lastMatch = m.end()
        }
        if (nomatch) {
            parts << txt
        } else if (lastMatch < txt.length()) {
            parts << txt.substring(lastMatch)
        }
        return parts
    }

    /**
     * Writes a Turn element: a Sync at the current time, then the speech,
     * the comments "(...)" and the time bullets of the Turn lines.
     *
     * @param g the ["turn", locutor, speech, more lines...] group
     * @param currentTime the last time bullet met before the Turn
     * @return the last time bullet met, currentTime if the Turn contains none
     */
    private String writeTurn(List g, String currentTime) {
        def comment_regex = /(\([^)]+\))/
        def tokens = []
        for (int i = 2 ; i < g.size() ; i++) { // speech starts at position 2
            // cut on the time bullets first, then on the comments
            for (String part : splitKeepingMatches(g[i], bullet_regex)) {
                tokens.addAll(splitKeepingMatches(part, comment_regex))
            }
        }

        pagedWriter.writeCharacters("\n")
        pagedWriter.writeStartElement("Turn");
        pagedWriter.writeAttribute("speaker", g[1]);
        pagedWriter.writeAttribute("startTime", "");
        pagedWriter.writeAttribute("endTime", "");
        pagedWriter.writeCharacters("\n")
        writeSync(currentTime)

        for (String t : tokens) {
            if (t.matches(bullet_regex)) {
                writeSync(t)
                currentTime = t
            } else if (t.startsWith("(") && t.endsWith(")")) {
                pagedWriter.writeCharacters("\n")
                pagedWriter.writeEmptyElement("Comment")
                pagedWriter.writeAttribute("desc", t.substring(1, t.length() - 1))
            } else {
                pagedWriter.writeCharacters(" "+t)
            }
        }

        pagedWriter.writeCharacters("\n")
        pagedWriter.writeEndElement(); // Turn
        return currentTime
    }

    /**
     * Closes the opened Section if any and opens a new one, using the
     * name="value" attributes found in the Section line over the default
     * ones. A line containing no attribute is reported and ignored.
     */
    private void writeSection(String line) {
        def m = (line =~ section_regex)
        if (!(m.size() > 0)) {
            println "ERROR $lastTiming malformed section "+line
            return
        }

        if (isThemeOpened) {
            pagedWriter.writeCharacters("\n")
            pagedWriter.writeEndElement(); // previous Section
        }
        isThemeOpened = true;
        pagedWriter.writeCharacters("\n")
        pagedWriter.writeStartElement("Section");

        def attr = ["type":"report", "topic":"", "startTime":"", "endTime":""]
        for (def match : m) { // match = [whole match, name, value]
            attr[match[1]] = match[2];
        }
        for (def k : attr.keySet()) pagedWriter.writeAttribute(k, attr[k]);
    }

    /**
     * Writes a Sync element whose time attribute is the time bullet
     * expressed in seconds. Nothing is written if the bullet is malformed.
     *
     * @param txt a time bullet: "(h:mm:ss)", "(h:mm:ss.f)" or "¤<milliseconds>"
     */
    def writeSync(String txt) {
        String seconds = bulletToSeconds(txt)
        if (seconds == null) {
            // parse before writing so that no Sync without time is produced
            String str = (txt != null && txt.length() >= 2) ? txt.substring(1, txt.length() - 1) : txt
            println "Failed to parse time: "+str
            return
        }
        pagedWriter.writeEmptyElement("Sync");
        pagedWriter.writeAttribute("time", seconds);
        pagedWriter.writeCharacters("\n")
    }

    /**
     * Converts a time bullet into a number of seconds.
     *
     * The fraction of "(h:mm:ss.f)" is a decimal fraction of second (".5" is
     * 500 milliseconds) and the hours are not limited to a clock range.
     *
     * @return the seconds as a decimal number, null if the bullet is malformed
     */
    private static String bulletToSeconds(String txt) {
        if (txt == null || txt.length() < 3) return null

        if (txt.startsWith("¤")) { // ¤<5648> -> 5.648
            String millis = txt.substring(2, txt.length() - 1)
            if (!millis.matches("[0-9]+")) return null
            // at least one digit is needed before the decimal point
            while (millis.length() < 4) millis = "0" + millis
            int cut = millis.length() - 3
            return millis.substring(0, cut) + "." + millis.substring(cut)
        }

        String[] fields = txt.substring(1, txt.length() - 1).split(":")
        if (fields.length != 3) return null
        try {
            String[] secParts = fields[2].split("\\.", 2)
            long millis = ((Long.parseLong(fields[0].trim()) * 60 + Long.parseLong(fields[1].trim())) * 60 + Long.parseLong(secParts[0].trim())) * 1000
            if (secParts.length == 2) {
                // right-pad the fraction to milliseconds: "5" -> 500, "25" -> 250
                millis += Long.parseLong((secParts[1].trim() + "000").substring(0, 3))
            }
            return "" + (millis / 1000.0d)
        } catch (NumberFormatException e) {
            return null
        }
    }
}