Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / transcription / HTML2TRS.groovy @ 479

History | View | Annotate | Download (11.5 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
21

    
22
package org.txm.macro.transcription
23

    
24
import org.txm.utils.xml.DomUtils;
25
import org.w3c.tidy.Tidy
26
import org.txm.doc.*
27
import javax.xml.stream.*
28
import java.net.URL
29
import java.io.File
30
import java.net.URL
31
import java.util.HashMap
32
import java.util.List
33
import org.txm.importer.*
34
import org.xml.sax.Attributes
35
import org.txm.importer.filters.*
36
import java.io.File
37
import java.io.IOException
38
import java.util.ArrayList
39
import javax.xml.parsers.SAXParserFactory
40
import javax.xml.parsers.ParserConfigurationException
41
import javax.xml.parsers.SAXParser
42
import java.net.URL
43
import org.xml.sax.InputSource
44
import org.xml.sax.helpers.DefaultHandler
45

    
46
import java.text.DecimalFormat
47
import java.text.ParseException
48
import java.text.SimpleDateFormat
49
import java.util.Date;
50

    
51
/**
 * Converts an XHTML transcription into a TRS XML document (root element
 * <code>Trans</code>).
 *
 * Conventions read from the XHTML body, one <code>&lt;p&gt;</code> at a time:
 * <ul>
 * <li>text before a run of four spaces is the speaker of the paragraph, the
 *     rest is what is said: the paragraph becomes a <code>Turn</code>;</li>
 * <li><code>(h:mm:ss.f)</code> bullets become <code>Sync</code> elements;</li>
 * <li>any other <code>(...)</code> span becomes a <code>Comment</code>;</li>
 * <li>a paragraph of the form <code>[name="value" ...]</code> opens a new
 *     <code>Section</code> carrying those attributes.</li>
 * </ul>
 * Nothing is written for paragraphs that come before the first time bullet
 * found in the body.
 *
 * An instance keeps its parsing state in fields and is therefore not meant to
 * be shared between threads. The state is reset at the start of each
 * {@link #process(File, File)} call, except {@link #locs} which accumulates.
 */
class HTML2TRS {

    /** Pattern of a full time code once the surrounding parentheses are removed. */
    private static final String TIME_CODE_REGEX = /([0-9]+):([0-9][0-9]?):([0-9][0-9]?)(\.([0-9]+))?/

    /** True while a Turn element is open in the output. */
    boolean inTurn = false
    /** Every speaker name met so far (kept across process() calls). */
    def locs = new HashSet<String>()
    /** Last time code written, used to locate problems in the messages. */
    def lastTiming = "none"
    /** Local name of the element the parser is currently on. */
    String localname = null
    /** Text accumulated for the current paragraph (4 spaces replaced by "####"). */
    String allTxt = ""
    String header = ""
    String type = ""
    /** True from the start of a paragraph until its first text event. */
    boolean startOfP = false
    /** True until the first Turn has been written. */
    boolean firstTurn = true
    /** True until the first Sync has been written by writeTimeCode. */
    boolean firstSync = true
    /** Speaker of the current paragraph, null when the paragraph has none. */
    def loc = null
    /** Incremented once per Turn written from a paragraph with a speaker. */
    int time = 0;
    /** True while the parser is inside the body element. */
    boolean start= false;
    /** True once the first time bullet of the body has been found. */
    boolean startTranscription=false
    /** True while a Section element is open in the output. */
    boolean isThemeOpened = false

    def inputData
    def factory
    def parser
    // The two formatters are still created by process() for compatibility,
    // but time codes are no longer converted with them (see writeTimeCode).
    def formater
    def formater2
    StaxStackWriter pagedWriter
    public HTML2TRS() { }

    /**
     * Converts <code>xhtmlFile</code> and writes the result in <code>trsFile</code>.
     *
     * Warning: <code>xhtmlFile</code> is rewritten in place first, without its
     * <code>&amp;nbsp;</code> entities and XHTML 1.0 Transitional DOCTYPE.
     *
     * @param xhtmlFile the XHTML transcription, read and written as UTF-8
     * @param trsFile the TRS file to create
     * @return true if a first time bullet was found and the document was
     *         completed, false otherwise ("ERROR6: header not found")
     */
    public process(File xhtmlFile, File trsFile) {
        resetParsingState()
        stripEntitiesAndDoctype(xhtmlFile)

        formater = new SimpleDateFormat("h:mm:ss.S");
        formater.setTimeZone(TimeZone.getTimeZone("GMT"));
        formater2 = new DecimalFormat("#######.0")
        formater2.setMaximumFractionDigits(2)

        parser = null
        inputData = xhtmlFile.toURI().toURL().openStream();
        try {
            factory = XMLInputFactory.newInstance();
            factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
            parser = factory.createXMLStreamReader(inputData);

            println "start writing TRS file"
            pagedWriter = new StaxStackWriter(trsFile, "UTF-8");
            try {
                writeDocumentStart()

                for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                    switch (event) {
                        case XMLStreamConstants.START_ELEMENT:
                            localname = parser.getLocalName()
                            switch (localname) {
                                case "body":
                                    println "START OF BODY"
                                    start = true
                                    break
                                case "p":
                                    loc = null
                                    allTxt = ""
                                    type = ""
                                    startOfP = true
                                    break
                            }
                            break
                        case XMLStreamConstants.END_ELEMENT:
                            localname = parser.getLocalName()
                            switch (localname) {
                                case "body":
                                    start = false
                                    break
                                case "p":
                                    endParagraph()
                                    break
                            }
                            break
                        case XMLStreamConstants.CHARACTERS:
                            handleCharacters()
                            break
                        default:
                            break
                    }
                }

                boolean ret = true
                if (startTranscription) {
                    if (isThemeOpened) { // close previous theme
                        pagedWriter.writeEndElement(); // Section
                    }
                    pagedWriter.writeEndElement(); // Episode
                    pagedWriter.writeEndElement(); // Trans
                    ret = true
                } else {
                    println "ERROR6: header not found"
                    ret = false;
                }
                return ret
            } finally {
                // always release the output file, even when the conversion fails
                pagedWriter.close()
            }
        } finally {
            // XMLStreamReader.close() does not close the underlying stream,
            // so both have to be released explicitly
            if (parser != null) {
                try {
                    parser.close()
                } catch (XMLStreamException e) {
                    println "WARNING could not close the XHTML parser: "+e
                }
            }
            inputData.close()
        }
    }

    /** Puts the per-document state back to its initial values; locs is kept. */
    private void resetParsingState() {
        inTurn = false
        lastTiming = "none"
        localname = null
        allTxt = ""
        startOfP = false
        firstTurn = true
        firstSync = true
        loc = null
        start = false
        startTranscription = false
        isThemeOpened = false
    }

    /** Rewrites xhtmlFile in place without the nbsp entities and the XHTML DOCTYPE. */
    private void stripEntitiesAndDoctype(File xhtmlFile) {
        println "remove entities and doctype"
        String text = xhtmlFile.getText("UTF-8")
        text = text.replaceAll("&nbsp;", " ")
        text = text.replaceAll("""<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">""", "")
        xhtmlFile.withWriter("UTF-8") { writer ->
            writer.write(text);
        }
    }

    /** Writes the XML declaration, the opening of Trans and the empty Speakers element. */
    private void writeDocumentStart() {
        pagedWriter.writeStartDocument("UTF-8", "1.0")
        //pagedWriter.writeDTD("<!DOCTYPE Trans SYSTEM \"trans-14.dtd\">") // removed because parser has the "no DTD" option set
        pagedWriter.writeStartElement("Trans");
        pagedWriter.writeAttribute("scribe", "SCRIBE");
        pagedWriter.writeAttribute("audio_filename", "");
        pagedWriter.writeAttribute("version", "1.0");
        pagedWriter.writeAttribute("version_date", "110110");
        pagedWriter.writeStartElement("Speakers");
        pagedWriter.writeEndElement(); // Speakers
    }

    /** Handles a text event: accumulates it and detects the first time bullet of the body. */
    private void handleCharacters() {
        String txt = parser.getText().replaceAll("\n", " ")
        // a run of 4 spaces separates the speaker from the speech: mark it
        allTxt += txt.replaceAll("    ", "####")

        if (!startTranscription && start && txt.matches("\\(.+:..:..\\..+\\)")) {
            println "START with $txt"
            startTranscription = true

            pagedWriter.writeStartElement("Episode");
            pagedWriter.writeAttribute("program", "NA");
            pagedWriter.writeAttribute("air_date", "NA");
            isThemeOpened = true
            pagedWriter.writeStartElement("Section");
            pagedWriter.writeAttribute("type", "report");
            pagedWriter.writeAttribute("topic", "");
            pagedWriter.writeAttribute("startTime", "0.0");
            pagedWriter.writeAttribute("endTime", "");
        }

        if (start && startTranscription && txt.length() > 0 && startOfP) {
            startOfP = false
        }
    }

    /** Handles the end of a paragraph: extracts the speaker then writes the paragraph content. */
    private void endParagraph() {
        // "> 0" and not ">= 0": a paragraph starting with the separator has no speaker
        if (allTxt.indexOf("####") > 0) {
            if (allTxt.startsWith("\"")) allTxt = allTxt.substring(1)
            loc = allTxt.substring(0, allTxt.indexOf("####")).trim()
            locs.add(loc)
            allTxt = allTxt.substring(allTxt.indexOf("####")+4) // remove the speaker and the separator
        }
        String trimedTxt = allTxt.trim()

        // nothing is written before the first time bullet has been found
        if (startTranscription) {
            if (loc) {
                writeSpeakerTurn()
            } else { // Sync or Comment or Section : out of Turn
                writeOutOfTurn(trimedTxt)
            }
        }

        allTxt = ""
    }

    /** Writes the current paragraph as a Turn of the current speaker. */
    private void writeSpeakerTurn() {
        inTurn = true;
        pagedWriter.writeStartElement("Turn");
        pagedWriter.writeAttribute("speaker", loc);
        pagedWriter.writeAttribute("startTime", "");
        pagedWriter.writeAttribute("endTime", "");

        // write Sync
        if (firstTurn) {
            writeTimeCode("(0:00:00.0)")
            firstTurn = false;
        } else {
            pagedWriter.writeEmptyElement("Sync")
            pagedWriter.writeAttribute("time", "");
        }

        if (!processCommentAndSync(allTxt)) {
            if (allTxt.trim().length() > 0)
                println "ERROR $lastTiming 'contains ([]) but malformed' error line "+parser.getLocation().getLineNumber()+" : $allTxt"
        }
        pagedWriter.writeEndElement(); // endTurn

        inTurn = false
        time++;
    }

    /** Writes a paragraph without speaker: time bullets, comments or a section declaration. */
    private void writeOutOfTurn(String trimedTxt) {
        if (allTxt.length() == 0) return

        if (trimedTxt.startsWith("(") && trimedTxt.endsWith(")")) {
            if (!processCommentAndSync(allTxt)) {
                if (allTxt.trim().length() > 0)
                    println "ERROR $lastTiming 'contains ([]) but malformed' line "+parser.getLocation().getLineNumber()+" : $allTxt"
            }
        } else if (trimedTxt.startsWith("[") && trimedTxt.endsWith("]")) {
            writeSection(trimedTxt)
        } else if (trimedTxt.length() > 0) {
            println "ERROR $lastTiming 'not in Turn and not a comment' line "+parser.getLocation().getLineNumber()+" : $allTxt"
        }
    }

    /** Closes the opened Section if any, then opens the one declared by trimedTxt. */
    private void writeSection(String trimedTxt) {
        println "SECTION: "+trimedTxt
        def section_regex = /([a-zA-Z]+)="([^"]+)"/
        def m = (trimedTxt =~ section_regex)
        if (m.size() > 0) {
            if (isThemeOpened)
                pagedWriter.writeEndElement(); // previous Section
            isThemeOpened = true;

            // Default values first, then the declared ones: a declared attribute
            // replaces its default instead of being written a second time, which
            // would make the XML ill-formed. Insertion order is preserved.
            def attributes = new LinkedHashMap<String, String>()
            attributes.put("type", "report")
            attributes.put("topic", "")
            attributes.put("startTime", "")
            attributes.put("endTime", "")
            for (def match : m) {
                if (match.size() == 3) {
                    attributes.put(match[1], match[2])
                } else {
                    println "ERROR $lastTiming malformed section attribute "+trimedTxt
                }
            }

            pagedWriter.writeStartElement("Section");
            for (def attribute : attributes.entrySet()) {
                pagedWriter.writeAttribute(attribute.getKey(), attribute.getValue());
            }
        } else {
            println "ERROR $lastTiming malformed section "+trimedTxt
        }
    }

    /**
     * Converts "h:mm:ss" or "h:mm:ss.f" to a number of seconds.
     *
     * The value is computed arithmetically: with SimpleDateFormat("h:mm:ss.S")
     * the fraction was read as a count of milliseconds (".5" gave 0.005s),
     * hour 12 was read as 0 and a time code without fraction was rejected.
     *
     * @param timeCode the time code without its parentheses
     * @return the seconds as a decimal string, e.g. "0:01:02.5" gives "62.5"
     * @throws ParseException if timeCode is not a time code
     */
    private static String timeCodeToSeconds(String timeCode) throws ParseException {
        def m = (timeCode =~ TIME_CODE_REGEX)
        if (!m.matches()) {
            throw new ParseException("Unparseable time code: \""+timeCode+"\"", 0)
        }
        long seconds = Long.parseLong(m.group(1)) * 3600L + Long.parseLong(m.group(2)) * 60L + Long.parseLong(m.group(3))
        String fraction = m.group(5)
        if (fraction == null) fraction = "0"
        return String.valueOf(seconds)+"."+fraction
    }

    /**
     * Writes a Sync element for a time bullet.
     *
     * If no Turn is opened, and this is not the first Sync of the document,
     * the Sync is wrapped in a Turn of the speaker "none".
     *
     * @param txt the time bullet with its parentheses, e.g. "(0:01:02.5)"
     * @throws ParseException if txt is not a time bullet
     */
    def writeTimeCode(String txt) {
        String str = txt.substring(1, txt.length()-1)
        String seconds = timeCodeToSeconds(str)
        lastTiming = str;

        boolean wrapInTurn = !inTurn && !firstSync
        if (wrapInTurn) {
            pagedWriter.writeStartElement("Turn");
            pagedWriter.writeAttribute("speaker", "none");
            pagedWriter.writeAttribute("startTime", "");
            pagedWriter.writeAttribute("endTime", "");
            inTurn = true
        }

        pagedWriter.writeEmptyElement("Sync");
        pagedWriter.writeAttribute("time", seconds);

        if (wrapInTurn) {
            inTurn = false
            pagedWriter.writeEndElement(); // endTurn
        }
        firstSync = false
    }

    /** Pattern of a time bullet: (h:mm:ss) with an optional one digit fraction. */
    def bullet_regex = /\([0-9]+:[0-9][0-9]?:[0-9][0-9]?(\.[0-9])?\)/

    /**
     * Writes the content of a paragraph: time bullets as Sync elements, other
     * parenthesized spans as Comment elements and, when the paragraph has a
     * speaker, the remaining text as characters. Text found without speaker is
     * only reported on the standard output.
     *
     * @param txt the paragraph text, without its speaker
     * @return false if a chunk still contains a parenthesis or a square bracket
     *         once the comments have been extracted, true otherwise
     */
    def processCommentAndSync(String txt) {
        if (txt.length() == 0) return true;

        boolean ret = true;

        // split txt around the time bullets, keeping the bullets
        def all = []
        def lastMatch = 0
        def m = (txt =~ bullet_regex)
        while (m.find()) {
            // the text between the previous bullet and this one (may be empty)
            all << txt.substring(lastMatch, m.start())
            // the bullet itself
            all << m.group()
            lastMatch = m.end()
        }
        // everything after the last bullet
        all << txt.substring(lastMatch)

        for (String str : all) {
            if (str.trim().length() == 0) continue

            if (str.matches(bullet_regex)) {
                writeTimeCode(str)
                continue
            }

            int idxOpen = str.indexOf("(");
            int idxClose = str.indexOf(")")

            // consume the "text (comment)" pairs from the left
            while (idxOpen >=0 && idxClose > idxOpen) {
                String before = str.substring(0, idxOpen)
                if (before.length() > 0) {
                    if (loc != null) {
                        pagedWriter.writeCharacters(before)
                    } else if (str.trim().length() > 0) {
                        println "WARNING $lastTiming no locutor at line "+parser.getLocation().getLineNumber()+" : $str"
                    }
                }
                pagedWriter.writeEmptyElement("Comment")
                pagedWriter.writeAttribute("desc", str.substring(idxOpen+1, idxClose))

                str = str.substring(idxClose+1)
                idxOpen = str.indexOf("(");
                idxClose = str.indexOf(")")
            }

            if (str.length() > 0) {
                if (loc != null) {
                    pagedWriter.writeCharacters(str)
                } else if (str.trim().length() > 0) {
                    println "ERROR $lastTiming 'not in turn and not a comment' line "+parser.getLocation().getLineNumber()+" : $str"
                }
            }

            // a bracket left in the remaining text means an unbalanced comment or section
            if (str.contains("(") || str.contains("[") || str.contains(")") || str.contains("]")) {
                ret = false;
            }
        }
        return ret;
    }
}