Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / macro / transcription / HTML2TRS.groovy @ 187

History | View | Annotate | Download (11 kB)

// Copyright © 2010-2013 ENS de Lyon.
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
// Lyon 2, University of Franche-Comté, University of Nice
// Sophia Antipolis, University of Paris 3.
//
// The TXM platform is free software: you can redistribute it
// and/or modify it under the terms of the GNU General Public
// License as published by the Free Software Foundation,
// either version 2 of the License, or (at your option) any
// later version.
//
// The TXM platform is distributed in the hope that it will be
// useful, but WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE. See the GNU General Public License for more
// details.
//
// You should have received a copy of the GNU General
// Public License along with the TXM platform. If not, see
// http://www.gnu.org/licenses.
21

    
22
package org.txm.macro.transcription
23

    
24
import java.text.DecimalFormat
25
import java.text.SimpleDateFormat
26

    
27
import javax.xml.stream.*
28

    
29
import org.txm.doc.*
30
import org.txm.importer.*
31
import org.txm.importer.filters.*
32

    
33
/**
 * Converts an XHTML transcription into a TRS XML file
 * (Trans / Speakers / Episode / Section / Turn / Sync / Comment elements).
 *
 * Input conventions, as implemented by the code below:
 * - a paragraph (p element) whose text contains a run of four spaces after
 *   its first character is a speaker turn: the text before the run is the
 *   speaker name, the text after it is the speech;
 * - "(h:mm:ss.d)" is a time code and becomes a Sync element;
 * - any other "(...)" becomes a Comment element;
 * - a paragraph of the form [key="value" ...] opens a new Section;
 * - nothing is written until a text node that looks like a time code has
 *   been met inside the body element.
 *
 * The parsing state lives in the fields below and is never reset, so one
 * instance should be used for a single process() call.
 */
class HTML2TRS {

    /** True while a Turn element is open in the output. */
    boolean inTurn = false
    /** Every speaker name met so far. */
    def locs = new HashSet<String>()
    /** Last time code written (without parentheses); used in the messages. */
    def lastTiming = "none"
    /** Local name of the element of the last start/end event. */
    String localname = null
    /** Text accumulated for the current paragraph; runs of 4 spaces are stored as "####". */
    String allTxt
    String header = ""
    /** Reset to "" at each paragraph start. */
    String type = ""
    /** True from a paragraph start until its first text node (once the transcription has started). */
    boolean startOfP = false
    /** True until the first speaker Turn has been written. */
    boolean firstTurn = true
    /** True until the first time code has been written. */
    boolean firstSync = true
    /** Speaker of the current paragraph, or null. */
    def loc = null
    /** Incremented once per speaker Turn written. */
    int time = 0
    /** True while the parser is inside the body element. */
    boolean start = false
    /** True once the first time code has been met in the body. */
    boolean startTranscription = false
    /** True while a Section element is open in the output. */
    boolean isThemeOpened = false

    def inputData
    def factory
    def parser
    /** Kept for backward compatibility; time codes are parsed by timeCodeToMillis(). */
    def formater
    def formater2
    StaxStackWriter pagedWriter

    public HTML2TRS() { }

    /**
     * Converts xhtmlFile into trsFile.
     *
     * The XHTML file is first rewritten in place: the "&nbsp;" entities are
     * replaced by spaces and the XHTML 1.0 Transitional DOCTYPE is removed.
     * The input stream, the XML reader and the TRS writer are closed even
     * when the conversion fails.
     *
     * @param xhtmlFile the transcription to read (modified in place)
     * @param trsFile the TRS file to write
     * @return true if a transcription start (a time code in the body) was
     *         found, false otherwise
     */
    public process(File xhtmlFile, File trsFile) {

        println "remove entities and doctype"
        String text = xhtmlFile.getText("UTF-8")
        text = text.replaceAll("&nbsp;", " ")
        text = text.replaceAll("""<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">""", "")
        xhtmlFile.withWriter("UTF-8") { writer ->
            writer.write(text)
        }

        inputData = xhtmlFile.toURI().toURL().openStream()
        try {
            factory = XMLInputFactory.newInstance()
            factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE)
            parser = factory.createXMLStreamReader(inputData)
            try {
                formater = new SimpleDateFormat("h:mm:ss.S")
                formater.setTimeZone(TimeZone.getTimeZone("GMT"))
                formater2 = new DecimalFormat("#######.0")
                formater2.setMaximumFractionDigits(2)

                println "start writing TRS file"
                pagedWriter = new StaxStackWriter(trsFile, "UTF-8")
                try {
                    writeDocumentStart()
                    for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
                        switch (event) {
                            case XMLStreamConstants.START_ELEMENT:
                                localname = parser.getLocalName()
                                if (localname == "body") {
                                    println "START OF BODY"
                                    start = true
                                } else if (localname == "p") {
                                    loc = null
                                    allTxt = ""
                                    type = ""
                                    startOfP = true
                                }
                                break
                            case XMLStreamConstants.END_ELEMENT:
                                localname = parser.getLocalName()
                                if (localname == "body") {
                                    start = false
                                } else if (localname == "p") {
                                    endOfParagraph()
                                }
                                break
                            case XMLStreamConstants.CHARACTERS:
                                onCharacters(parser.getText())
                                break
                            default:
                                break
                        }
                    }
                    return writeDocumentEnd()
                } finally {
                    pagedWriter.close()
                }
            } finally {
                // XMLStreamReader.close() does not close the underlying stream
                parser.close()
            }
        } finally {
            inputData.close()
        }
    }

    /** Writes the XML declaration, the Trans root element and an empty Speakers element. */
    private void writeDocumentStart() {
        pagedWriter.writeStartDocument("UTF-8", "1.0")
        // no DOCTYPE is written: the parser is configured without DTD support
        pagedWriter.writeStartElement("Trans")
        pagedWriter.writeAttribute("scribe", "SCRIBE")
        pagedWriter.writeAttribute("audio_filename", "")
        pagedWriter.writeAttribute("version", "1.0")
        pagedWriter.writeAttribute("version_date", "110110")
        pagedWriter.writeStartElement("Speakers")
        pagedWriter.writeEndElement() // Speakers
    }

    /**
     * Closes the elements still open at the end of the input.
     *
     * @return false (after printing an error) if the transcription never started
     */
    private boolean writeDocumentEnd() {
        if (!startTranscription) {
            println "ERROR6: header not found"
            return false
        }
        if (isThemeOpened) {
            pagedWriter.writeEndElement() // Section
        }
        pagedWriter.writeEndElement() // Episode
        pagedWriter.writeEndElement() // Trans
        return true
    }

    /**
     * Handles a text node: appends it to the current paragraph text and, on
     * the first time code met inside the body, opens the Episode and the
     * first Section.
     */
    private void onCharacters(String rawText) {
        String txt = rawText.replaceAll("\n", " ")
        // four spaces separate the speaker name from the speech
        allTxt += txt.replaceAll("    ", "####")

        if (!startTranscription && start && txt.matches("\\(.+:..:..\\..+\\)")) {
            println "START with $txt"
            startTranscription = true

            pagedWriter.writeStartElement("Episode")
            pagedWriter.writeAttribute("program", "NA")
            pagedWriter.writeAttribute("air_date", "NA")
            isThemeOpened = true
            pagedWriter.writeStartElement("Section")
            pagedWriter.writeAttribute("type", "report")
            pagedWriter.writeAttribute("topic", "")
            pagedWriter.writeAttribute("startTime", "0.0")
            pagedWriter.writeAttribute("endTime", "")
        }

        if (start && startTranscription && txt.length() > 0 && startOfP) {
            startOfP = false
        }
    }

    /**
     * Handles the end of a paragraph: extracts the speaker if there is one,
     * then writes either a speaker Turn or an out-of-turn item.
     * Nothing is written before the transcription has started.
     */
    private void endOfParagraph() {
        if (allTxt.indexOf("####") > 0) {
            if (allTxt.startsWith("\"")) allTxt = allTxt.substring(1)
            // the index is recomputed because the leading quote may have been removed
            loc = allTxt.substring(0, allTxt.indexOf("####")).trim()
            locs.add(loc)
            allTxt = allTxt.substring(allTxt.indexOf("####") + 4)
        }
        String trimedTxt = allTxt.trim()

        if (startTranscription) {
            if (loc) {
                writeSpeakerTurn()
            } else if (allTxt.length() > 0) {
                // Sync or Comment or Section: out of Turn
                writeOutOfTurn(trimedTxt)
            }
        }

        allTxt = ""
    }

    /** Writes the current paragraph as a Turn of the current speaker. */
    private void writeSpeakerTurn() {
        inTurn = true
        pagedWriter.writeStartElement("Turn")
        pagedWriter.writeAttribute("speaker", loc)
        pagedWriter.writeAttribute("startTime", "")
        pagedWriter.writeAttribute("endTime", "")

        // every Turn starts with a Sync; only the first one has a known time
        if (firstTurn) {
            writeTimeCode("(0:00:00.0)")
            firstTurn = false
        } else {
            pagedWriter.writeEmptyElement("Sync")
            pagedWriter.writeAttribute("time", "")
        }

        if (!processCommentAndSync(allTxt)) {
            if (allTxt.trim().length() > 0)
                println "ERROR $lastTiming 'contains ([]) but malformed' error line "+parser.getLocation().getLineNumber()+" : $allTxt"
        }
        pagedWriter.writeEndElement() // endTurn

        inTurn = false
        time++
    }

    /**
     * Writes a paragraph that has no speaker: "(...)" is processed as
     * time codes and comments, "[...]" opens a new Section, anything else
     * is reported as an error.
     *
     * @param trimedTxt the trimmed paragraph text
     */
    private void writeOutOfTurn(String trimedTxt) {
        if (trimedTxt.startsWith("(") && trimedTxt.endsWith(")")) {
            if (!processCommentAndSync(allTxt)) {
                if (allTxt.trim().length() > 0)
                    println "ERROR $lastTiming 'contains ([]) but malformed' line "+parser.getLocation().getLineNumber()+" : $allTxt"
            }
        } else if (trimedTxt.startsWith("[") && trimedTxt.endsWith("]")) {
            println "SECTION: "+trimedTxt
            def section_regex = /([a-zA-Z]+)="([^"]+)"/
            def m = (trimedTxt =~ section_regex)
            if (m.size() > 0) {
                // The default attributes come first; a key="value" pair with
                // the same name replaces the default instead of producing a
                // duplicate attribute.
                def attributes = new LinkedHashMap<String, String>()
                attributes.put("type", "report")
                attributes.put("topic", "")
                attributes.put("startTime", "")
                attributes.put("endTime", "")
                for (def match : m) {
                    if (match.size() == 3) {
                        attributes.put(match[1], match[2])
                    } else {
                        println "ERROR $lastTiming malformed section attribute "+trimedTxt
                    }
                }

                if (isThemeOpened)
                    pagedWriter.writeEndElement() // previous Section
                isThemeOpened = true
                pagedWriter.writeStartElement("Section")
                for (def attribute : attributes.entrySet()) {
                    pagedWriter.writeAttribute(attribute.getKey(), attribute.getValue())
                }
            } else {
                println "ERROR $lastTiming malformed section "+trimedTxt
            }
        } else if (allTxt.trim().length() > 0) {
            println "ERROR $lastTiming 'not in Turn and not a comment' line "+parser.getLocation().getLineNumber()+" : $allTxt"
        }
    }

    /**
     * Writes a time code as a Sync element whose time attribute is in seconds.
     *
     * Inside a Turn, or for the very first time code, the Sync is written
     * directly. Otherwise it is wrapped in a Turn of speaker "none".
     *
     * @param txt the time code with its parentheses, e.g. "(0:01:02.5)"
     * @throws java.text.ParseException if txt is not a time code
     */
    def writeTimeCode(String txt) {
        String str = txt.substring(1, txt.length() - 1)
        long millis = timeCodeToMillis(str)
        lastTiming = str
        if (inTurn || firstSync) {
            pagedWriter.writeEmptyElement("Sync")
            pagedWriter.writeAttribute("time", "" + (millis / 1000.0d))
        } else {
            // a Sync is not written out of a Turn: wrap it in an anonymous one
            pagedWriter.writeStartElement("Turn")
            pagedWriter.writeAttribute("speaker", "none")
            pagedWriter.writeAttribute("startTime", "")
            pagedWriter.writeAttribute("endTime", "")
            inTurn = true
            writeTimeCode(txt)
            inTurn = false
            pagedWriter.writeEndElement() // endTurn
        }
        firstSync = false
    }

    /**
     * Converts "h:mm:ss" or "h:mm:ss.f" into milliseconds.
     *
     * The digits after the dot are a decimal fraction of a second (".5" is
     * 500 ms); digits beyond the third are ignored. The fraction is optional,
     * as in bullet_regex.
     *
     * @param timeCode the time code without parentheses
     * @throws java.text.ParseException if timeCode does not have this form
     */
    private static long timeCodeToMillis(String timeCode) {
        def m = (timeCode =~ /(\d+):(\d{1,2}):(\d{1,2})(?:\.(\d+))?/)
        if (!m.matches()) {
            throw new java.text.ParseException("Unparseable time code: \"" + timeCode + "\"", 0)
        }
        long hours = Long.parseLong(m.group(1))
        long minutes = Long.parseLong(m.group(2))
        long seconds = Long.parseLong(m.group(3))
        long millis = 0
        String fraction = m.group(4)
        if (fraction != null) {
            // right-pad to 3 digits: "5" -> 500, "25" -> 250
            millis = Long.parseLong((fraction + "00").substring(0, 3))
        }
        return ((hours * 60 + minutes) * 60 + seconds) * 1000 + millis
    }

    /** Matches a time code: "(h:mm:ss)" with an optional ".d" fraction. */
    def bullet_regex = /\([0-9]+:[0-9][0-9]?:[0-9][0-9]?(\.[0-9])?\)/

    /**
     * Writes a paragraph text: time codes become Sync elements, other
     * "(...)" groups become Comment elements and the remaining text is
     * written as characters when there is a current speaker.
     *
     * @param txt the text to process
     * @return false if a non time code fragment still contains a
     *         parenthesis or a square bracket once its comments have been
     *         extracted, true otherwise
     */
    def processCommentAndSync(String txt) {
        if (txt.length() == 0) return true

        // split txt around the time codes, keeping them as separate fragments
        def fragments = []
        int lastMatch = 0
        def m = (txt =~ bullet_regex)
        while (m.find()) {
            fragments << txt.substring(lastMatch, m.start())
            fragments << m.group()
            lastMatch = m.end()
        }
        fragments << txt.substring(lastMatch)

        boolean ret = true
        for (String str : fragments) {
            if (str.trim().length() == 0) continue

            if (str.matches(bullet_regex)) {
                writeTimeCode(str)
                continue
            }

            int idxOpen = str.indexOf("(")
            int idxClose = str.indexOf(")")
            while (idxOpen >= 0 && idxClose > idxOpen) {
                // text before the comment
                if (idxOpen > 0) {
                    if (loc != null) {
                        pagedWriter.writeCharacters(str.substring(0, idxOpen))
                    } else if (str.trim().length() > 0) {
                        println "WARNING $lastTiming no locutor at line "+parser.getLocation().getLineNumber()+" : $str"
                    }
                }
                pagedWriter.writeEmptyElement("Comment")
                pagedWriter.writeAttribute("desc", str.substring(idxOpen + 1, idxClose))

                str = str.substring(idxClose + 1)
                idxOpen = str.indexOf("(")
                idxClose = str.indexOf(")")
            }

            // text after the last comment
            if (str.length() > 0) {
                if (loc != null) {
                    pagedWriter.writeCharacters(str)
                } else if (str.trim().length() > 0) {
                    println "ERROR $lastTiming 'not in turn and not a comment' line "+parser.getLocation().getLineNumber()+" : $str"
                }
            }

            // a leftover bracket means the fragment was malformed
            if (str.contains("(") || str.contains("[") || str.contains(")") || str.contains("]")) {
                ret = false
            }
        }
        return ret
    }
}