// Source: root/tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/HTML2TRS.groovy @ revision 1688
// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.

    
package org.txm.macro.transcription

import java.io.File
import java.io.IOException
import java.net.URL
import java.text.DecimalFormat
import java.text.ParseException
import java.text.SimpleDateFormat
import java.util.ArrayList
import java.util.Date;
import java.util.HashMap
import java.util.List
import java.util.regex.Matcher
import java.util.regex.Pattern

import javax.xml.parsers.ParserConfigurationException
import javax.xml.parsers.SAXParser
import javax.xml.parsers.SAXParserFactory
import javax.xml.stream.*

import org.txm.doc.*
import org.txm.importer.scripts.filters.*
import org.txm.scripts.importer.*
import org.txm.utils.xml.DomUtils;
import org.w3c.tidy.Tidy
import org.xml.sax.Attributes
import org.xml.sax.InputSource
import org.xml.sax.helpers.DefaultHandler

/**
 * Converts an XHTML transcription into a TRS file ("Trans" root element).
 *
 * The XHTML body is read paragraph by paragraph ("p" elements):
 * - a run of four spaces separates the speaker name from what is said;
 * - "(h:mm:ss.t)" bullets become "Sync" elements;
 * - any other "(...)" span becomes a "Comment" element;
 * - a paragraph of the form [name="value" ...] opens a new "Section".
 *
 * Nothing is written inside "Episode" until the first text node of the body
 * that looks like a time bullet has been met.
 *
 * An instance keeps its parsing state in fields: it must not be shared
 * between concurrent conversions.
 */
class HTML2TRS {

    /** Regular expression of a time bullet such as "(0:12:34.5)"; the tenths part is optional. */
    def bullet_regex = /\([0-9]+:[0-9][0-9]?:[0-9][0-9]?(\.[0-9])?\)/

    /** Splits a time code (without parentheses) into hours, minutes, seconds and optional fraction digits. */
    private static final Pattern TIME_CODE = Pattern.compile(/([0-9]+):([0-9][0-9]?):([0-9][0-9]?)(?:\.([0-9]+))?/)

    // --- parsing state, reset at the beginning of each process() call ---
    boolean inTurn = false               // true while a "Turn" element is open
    def locs = new HashSet<String>()     // every speaker name met so far (never cleared)
    def lastTiming = "none"              // last time code written, used in log messages
    String localname = null              // local name of the element being read
    String allTxt                        // text accumulated for the current paragraph
    String header = ""
    String type = ""
    boolean startOfP = false             // true until the first text of a paragraph is read
    boolean firstTurn = true             // the first Turn gets a "(0:00:00.0)" Sync
    boolean firstSync = true             // the first Sync may be written outside a Turn
    def loc = null                       // speaker of the current paragraph, null if none
    int time = 0                         // number of Turn elements written
    boolean start = false                // true while inside the "body" element
    boolean startTranscription = false   // true once the first time bullet has been met
    boolean isThemeOpened = false        // true while a "Section" element is open

    // --- resources and formatters ---
    def inputData
    def factory
    def parser
    def formater    // kept for backward compatibility, no longer used to parse time codes
    def formater2   // kept for backward compatibility, unused by this class
    StaxStackWriter pagedWriter

    public HTML2TRS() { }

    /**
     * Converts xhtmlFile into trsFile.
     *
     * Side effect: xhtmlFile is rewritten in place with the "&nbsp;" entities
     * replaced by spaces and the XHTML 1.0 Transitional doctype removed.
     * The output writer, the XML reader and the input stream are closed even
     * when the conversion fails.
     *
     * @param xhtmlFile the XHTML transcription to read (UTF-8)
     * @param trsFile the TRS file to write (UTF-8)
     * @return true if a first time bullet was found and the document was
     *         completed, false otherwise ("ERROR6: header not found")
     */
    public process(File xhtmlFile, File trsFile) {
        // start from a clean state so that the same instance can convert several files
        resetState()

        println "remove entities and doctype"
        String text = xhtmlFile.getText("UTF-8")
        text = text.replaceAll("&nbsp;", " ")
        text = text.replaceAll("""<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">""", "")
        xhtmlFile.withWriter("UTF-8") { writer ->
            writer.write(text)
        }

        formater = new SimpleDateFormat("h:mm:ss.S")
        formater.setTimeZone(TimeZone.getTimeZone("GMT"))
        formater2 = new DecimalFormat("#######.0")
        formater2.setMaximumFractionDigits(2)

        try {
            inputData = xhtmlFile.toURI().toURL().openStream()
            factory = XMLInputFactory.newInstance()
            factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE)
            parser = factory.createXMLStreamReader(inputData)

            println "start writing TRS file"
            pagedWriter = new StaxStackWriter(trsFile, "UTF-8")
            pagedWriter.writeStartDocument("UTF-8", "1.0")
            // no DTD is written because the parser has the "no DTD" option set
            pagedWriter.writeStartElement("Trans")
            pagedWriter.writeAttribute("scribe", "SCRIBE")
            pagedWriter.writeAttribute("audio_filename", "")
            pagedWriter.writeAttribute("version", "1.0")
            pagedWriter.writeAttribute("version_date", "110110")
            pagedWriter.writeStartElement("Speakers")
            pagedWriter.writeEndElement() // Speakers

            for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                switch (event) {
                    case XMLStreamConstants.START_ELEMENT:
                        localname = parser.getLocalName()
                        if (localname == "body") {
                            println "START OF BODY"
                            start = true
                        } else if (localname == "p") {
                            loc = null
                            allTxt = ""
                            type = ""
                            startOfP = true
                        }
                        break
                    case XMLStreamConstants.END_ELEMENT:
                        localname = parser.getLocalName()
                        if (localname == "body") {
                            start = false
                        } else if (localname == "p") {
                            endParagraph()
                        }
                        break
                    case XMLStreamConstants.CHARACTERS:
                        readCharacters(parser.getText())
                        break
                    default:
                        break
                }
            }

            boolean ret
            if (startTranscription) {
                if (isThemeOpened) {
                    pagedWriter.writeEndElement() // Section
                }
                pagedWriter.writeEndElement() // Episode
                pagedWriter.writeEndElement() // Trans
                ret = true
            } else {
                println "ERROR6: header not found"
                ret = false
            }
            return ret
        } finally {
            // release everything that was opened, whatever happened above
            try {
                if (pagedWriter != null) pagedWriter.close()
            } finally {
                try {
                    if (parser != null) parser.close()
                } finally {
                    if (inputData != null) inputData.close()
                }
            }
        }
    }

    /** Puts the parsing flags and resource fields back to their initial values; locs is left untouched. */
    private void resetState() {
        inTurn = false
        lastTiming = "none"
        localname = null
        allTxt = ""
        type = ""
        startOfP = false
        firstTurn = true
        firstSync = true
        loc = null
        time = 0
        start = false
        startTranscription = false
        isThemeOpened = false
        inputData = null
        parser = null
        pagedWriter = null
    }

    /** Accumulates a text node and opens Episode/Section when the first time bullet of the body is met. */
    private void readCharacters(String rawText) {
        String txt = rawText.replaceAll("\n", " ")
        // four spaces separate the speaker from the speech: mark them so that they survive trimming
        allTxt += txt.replaceAll("    ", "####")

        if (!startTranscription && start && txt.matches("\\(.+:..:..\\..+\\)")) {
            println "START with $txt"
            startTranscription = true

            pagedWriter.writeStartElement("Episode")
            pagedWriter.writeAttribute("program", "NA")
            pagedWriter.writeAttribute("air_date", "NA")
            isThemeOpened = true
            pagedWriter.writeStartElement("Section")
            pagedWriter.writeAttribute("type", "report")
            pagedWriter.writeAttribute("topic", "")
            pagedWriter.writeAttribute("startTime", "0.0")
            pagedWriter.writeAttribute("endTime", "")
        }

        if (start && startTranscription && txt.length() > 0 && startOfP) {
            startOfP = false
        }
    }

    /** Handles the end of a "p" element: extracts the speaker, then writes a Turn, a Sync/Comment or a Section. */
    private void endParagraph() {
        int separator = allTxt.indexOf("####")
        if (separator > 0) {
            if (allTxt.startsWith("\"")) allTxt = allTxt.substring(1)
            separator = allTxt.indexOf("####")
            loc = allTxt.substring(0, separator).trim()
            locs.add(loc)
            allTxt = allTxt.substring(separator + 4) // drop the speaker and the separator
        }

        // nothing is written before the first time bullet
        if (startTranscription) {
            if (loc) {
                writeTurn()
            } else if (allTxt.length() > 0) {
                writeOutOfTurn(allTxt.trim())
            }
        }
        allTxt = ""
    }

    /** Writes the current paragraph as a Turn of the current speaker. */
    private void writeTurn() {
        inTurn = true
        pagedWriter.writeStartElement("Turn")
        pagedWriter.writeAttribute("speaker", loc)
        pagedWriter.writeAttribute("startTime", "")
        pagedWriter.writeAttribute("endTime", "")

        // every Turn starts with a Sync; only the first one has a known time
        if (firstTurn) {
            writeTimeCode("(0:00:00.0)")
            firstTurn = false
        } else {
            pagedWriter.writeEmptyElement("Sync")
            pagedWriter.writeAttribute("time", "")
        }

        if (!processCommentAndSync(allTxt)) {
            if (allTxt.trim().length() > 0)
                println "ERROR $lastTiming 'contains ([]) but malformed' error line "+parser.getLocation().getLineNumber()+" : $allTxt"
        }
        pagedWriter.writeEndElement() // Turn

        inTurn = false
        time++
    }

    /** Writes a paragraph without speaker: "(...)" is a Sync or a Comment, "[...]" a Section, anything else is reported. */
    private void writeOutOfTurn(String trimmed) {
        if (trimmed.startsWith("(") && trimmed.endsWith(")")) {
            if (!processCommentAndSync(allTxt)) {
                if (trimmed.length() > 0)
                    println "ERROR $lastTiming 'contains ([]) but malformed' line "+parser.getLocation().getLineNumber()+" : $allTxt"
            }
        } else if (trimmed.startsWith("[") && trimmed.endsWith("]")) {
            writeSection(trimmed)
        } else if (trimmed.length() > 0) {
            println "ERROR $lastTiming 'not in Turn and not a comment' line "+parser.getLocation().getLineNumber()+" : $allTxt"
        }
    }

    /**
     * Closes the opened Section, if any, and opens a new one.
     * The name="value" pairs found in the text override the default
     * attributes instead of being written a second time.
     */
    private void writeSection(String trimmed) {
        println "SECTION: "+trimmed
        def declared = [:]
        def m = (trimmed =~ /([a-zA-Z]+)="([^"]+)"/)
        while (m.find()) {
            declared[m.group(1)] = m.group(2)
        }
        if (declared.isEmpty()) {
            println "ERROR $lastTiming malformed section "+trimmed
            return
        }

        if (isThemeOpened) {
            pagedWriter.writeEndElement() // previous Section
        }
        isThemeOpened = true
        pagedWriter.writeStartElement("Section")
        def attributes = [type: "report", topic: "", startTime: "", endTime: ""]
        attributes.putAll(declared) // keeps the default order, appends the extra attributes
        attributes.each { name, value ->
            pagedWriter.writeAttribute(name, value)
        }
    }

    /**
     * Converts a time code such as "1:02:03.5" (no parentheses) into seconds.
     * The digits after the dot are decimals of a second; they may be omitted.
     *
     * @param timeCode hours:minutes:seconds with an optional .fraction
     * @return the number of seconds, e.g. 3723.5 for "1:02:03.5"
     * @throws ParseException if timeCode does not have this form
     */
    static double parseTimeCode(String timeCode) throws ParseException {
        Matcher m = TIME_CODE.matcher(timeCode.trim())
        if (!m.matches()) {
            throw new ParseException("Unparseable time code: \"" + timeCode + "\"", 0)
        }
        long wholeSeconds = Long.parseLong(m.group(1)) * 3600L + Long.parseLong(m.group(2)) * 60L + Long.parseLong(m.group(3))
        BigDecimal fraction = (m.group(4) == null) ? BigDecimal.ZERO : new BigDecimal("0." + m.group(4))
        // exact decimal sum, so that "62.5" is not turned into 62.499999...
        return new BigDecimal(wholeSeconds).add(fraction).doubleValue()
    }

    /**
     * Writes a "Sync" element for a time bullet.
     * Outside a Turn, and except for the very first Sync, the Sync is wrapped
     * in a Turn whose speaker is "none".
     *
     * @param txt the bullet with its parentheses, e.g. "(0:12:34.5)"
     * @throws ParseException if the time code is malformed
     */
    def writeTimeCode(String txt) {
        String str = txt.substring(1, txt.length() - 1) // drop the parentheses
        double seconds = parseTimeCode(str)
        lastTiming = str

        boolean wrapInTurn = !(inTurn || firstSync)
        if (wrapInTurn) {
            pagedWriter.writeStartElement("Turn")
            pagedWriter.writeAttribute("speaker", "none")
            pagedWriter.writeAttribute("startTime", "")
            pagedWriter.writeAttribute("endTime", "")
        }
        pagedWriter.writeEmptyElement("Sync")
        pagedWriter.writeAttribute("time", "" + seconds)
        if (wrapInTurn) {
            pagedWriter.writeEndElement() // Turn
        }
        firstSync = false
    }

    /**
     * Writes the content of a paragraph: time bullets as Sync elements,
     * "(...)" spans as Comment elements and the remaining text as characters.
     * Text is only written when a speaker is set (loc); otherwise it is
     * reported on the standard output.
     *
     * @param txt the paragraph text, without its speaker
     * @return false if a text segment still contains one of ( ) [ ] once the
     *         comments have been extracted, true otherwise
     */
    def processCommentAndSync(String txt) {
        if (txt.length() == 0) return true

        // cut the text at each time bullet, keeping the bullets as segments of their own
        def segments = []
        int cursor = 0
        def bullets = (txt =~ bullet_regex)
        while (bullets.find()) {
            segments << txt.substring(cursor, bullets.start())
            segments << bullets.group()
            cursor = bullets.end()
        }
        segments << txt.substring(cursor)

        boolean ret = true
        for (String str : segments) {
            if (str.trim().length() == 0) continue

            if (str.matches(bullet_regex)) {
                writeTimeCode(str)
                continue
            }

            // extract the "(...)" comments from left to right
            int idxOpen = str.indexOf("(")
            int idxClose = str.indexOf(")")
            while (idxOpen >= 0 && idxClose > idxOpen) {
                if (idxOpen > 0) {
                    if (loc != null) {
                        pagedWriter.writeCharacters(str.substring(0, idxOpen))
                    } else {
                        println "WARNING $lastTiming no locutor at line "+parser.getLocation().getLineNumber()+" : $str"
                    }
                }
                pagedWriter.writeEmptyElement("Comment")
                pagedWriter.writeAttribute("desc", str.substring(idxOpen + 1, idxClose))
                str = str.substring(idxClose + 1)
                idxOpen = str.indexOf("(")
                idxClose = str.indexOf(")")
            }

            // what remains after the last comment
            if (str.length() > 0) {
                if (loc != null) {
                    pagedWriter.writeCharacters(str)
                } else if (str.trim().length() > 0) {
                    println "ERROR $lastTiming 'not in turn and not a comment' line "+parser.getLocation().getLineNumber()+" : $str"
                }
            }
            if (str.contains("(") || str.contains("[") || str.contains(")") || str.contains("]")) {
                ret = false
            }
        }
        return ret
    }
}