Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / transcription / HTML2TRS.groovy @ 499

History | View | Annotate | Download (11.5 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
22 321 mdecorde
package org.txm.macro.transcription
23 321 mdecorde
24 479 mdecorde
import org.txm.utils.xml.DomUtils;
25 479 mdecorde
import org.w3c.tidy.Tidy
26 479 mdecorde
import org.txm.doc.*
27 321 mdecorde
import javax.xml.stream.*
28 479 mdecorde
import java.net.URL
29 479 mdecorde
import java.io.File
30 479 mdecorde
import java.net.URL
31 479 mdecorde
import java.util.HashMap
32 479 mdecorde
import java.util.List
33 321 mdecorde
import org.txm.importer.*
34 479 mdecorde
import org.xml.sax.Attributes
35 321 mdecorde
import org.txm.importer.filters.*
36 479 mdecorde
import java.io.File
37 479 mdecorde
import java.io.IOException
38 479 mdecorde
import java.util.ArrayList
39 479 mdecorde
import javax.xml.parsers.SAXParserFactory
40 479 mdecorde
import javax.xml.parsers.ParserConfigurationException
41 479 mdecorde
import javax.xml.parsers.SAXParser
42 479 mdecorde
import java.net.URL
43 479 mdecorde
import org.xml.sax.InputSource
44 479 mdecorde
import org.xml.sax.helpers.DefaultHandler
45 321 mdecorde
46 479 mdecorde
import java.text.DecimalFormat
47 479 mdecorde
import java.text.ParseException
48 479 mdecorde
import java.text.SimpleDateFormat
49 479 mdecorde
import java.util.Date;
50 479 mdecorde
51 321 mdecorde
/**
 * Converts an XHTML transcription into an XML document whose root element is
 * "Trans" (with "Episode", "Section", "Turn", "Sync" and "Comment" elements).
 *
 * Conventions read from the XHTML paragraphs ("p" elements):
 * <ul>
 * <li>the transcription starts at the first text node of the body that looks like
 *     a time bullet, e.g. "(0:00:00.0)";</li>
 * <li>a paragraph containing a run of four spaces is a speaker turn: the text
 *     before the run is the speaker label, the text after it is the speech;</li>
 * <li>"(h:mm:ss.d)" is a time bullet, any other "(...)" is a comment;</li>
 * <li>a paragraph of the form [name="value" ...] opens a new section.</li>
 * </ul>
 *
 * The parsing state is kept in instance fields that are never reset, so an
 * instance should be used for a single call to {@link #process(File, File)}.
 */
class HTML2TRS {

	/** True while a "Turn" element is open on the writer. */
	boolean inTurn = false
	/** Speaker labels met so far. */
	def locs = new HashSet<String>()
	/** Last time code written (without parentheses); used in diagnostics. */
	def lastTiming = "none"
	/** Local name of the element currently being processed. */
	String localname = null
	/** Text accumulated for the current paragraph (4 spaces replaced by "####"). */
	String allTxt
	String header = ""
	String type = ""
	/** True between the start of a paragraph and its first text node (once the transcription has started). */
	boolean startOfP = false
	/** True until the first speaker turn has been written. */
	boolean firstTurn = true
	/** True until the first time code has been written. */
	boolean firstSync = true
	/** Speaker label of the current paragraph, or null when it has none. */
	def loc = null
	/** Number of speaker turns written. */
	int time = 0;
	/** True while inside the "body" element. */
	boolean start= false;
	/** True once the first time bullet of the body has been met. */
	boolean startTranscription=false
	/** True while a "Section" element is open on the writer. */
	boolean isThemeOpened = false

	def inputData
	def factory
	def parser
	// formater and formater2 are still initialised by process() for backward
	// compatibility; time codes are no longer parsed with them (see parseTimeCode).
	def formater
	def formater2
	StaxStackWriter pagedWriter

	/** Matches a time bullet such as "(0:12:34.5)" or "(0:12:34)". */
	def bullet_regex = /\([0-9]+:[0-9][0-9]?:[0-9][0-9]?(\.[0-9])?\)/

	public HTML2TRS() { }

	/**
	 * Converts xhtmlFile into trsFile.
	 *
	 * WARNING: xhtmlFile is rewritten in place first ("&nbsp;" entities and the
	 * XHTML 1.0 Transitional DOCTYPE are removed).
	 *
	 * @param xhtmlFile the XHTML transcription to read (modified, see above)
	 * @param trsFile the file to write
	 * @return true if a transcription start was found and the document was
	 *         written, false otherwise
	 */
	public process(File xhtmlFile, File trsFile) {

		println "remove entities and doctype"
		String text = xhtmlFile.getText("UTF-8")
		text = text.replaceAll("&nbsp;", " ")
		text = text.replaceAll('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">', "")
		xhtmlFile.withWriter("UTF-8") { writer ->
			writer.write(text);
		}

		// forget resources of a previous call so that the finally block below
		// only closes what this call opened
		inputData = null
		parser = null
		pagedWriter = null

		boolean ret = true
		try {
			inputData = xhtmlFile.toURI().toURL().openStream();
			factory = XMLInputFactory.newInstance();
			factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
			parser = factory.createXMLStreamReader(inputData);

			formater = new SimpleDateFormat("h:mm:ss.S");
			formater.setTimeZone(TimeZone.getTimeZone("GMT"));
			formater2 = new DecimalFormat("#######.0")
			formater2.setMaximumFractionDigits(2)

			println "start writing TRS file"
			pagedWriter = new StaxStackWriter(trsFile, "UTF-8");
			pagedWriter.writeStartDocument("UTF-8", "1.0")
			//pagedWriter.writeDTD("<!DOCTYPE Trans SYSTEM \"trans-14.dtd\">") // removed because parser has the "no DTD" option set
			pagedWriter.writeStartElement("Trans");
			pagedWriter.writeAttribute("scribe", "SCRIBE");
			pagedWriter.writeAttribute("audio_filename", "");
			pagedWriter.writeAttribute("version", "1.0");
			pagedWriter.writeAttribute("version_date", "110110");
			pagedWriter.writeStartElement("Speakers");
			pagedWriter.writeEndElement(); // Speakers

			for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
				switch (event) {
					case XMLStreamConstants.START_ELEMENT:
						handleStartElement()
						break;
					case XMLStreamConstants.END_ELEMENT:
						handleEndElement()
						break;
					case XMLStreamConstants.CHARACTERS:
						handleCharacters()
						break;
					default: break;
				}
			}

			if (startTranscription) {
				if (isThemeOpened) { // close previous theme
					pagedWriter.writeEndElement(); // Section
				}
				pagedWriter.writeEndElement(); // Episode
				pagedWriter.writeEndElement(); // Trans
				ret = true
			} else {
				println "ERROR6: header not found"
				ret = false;
			}
		} finally {
			// release everything, also when the conversion failed half-way
			try {
				if (pagedWriter != null) pagedWriter.close()
			} finally {
				closeInput()
			}
		}
		return ret
	}

	/** Closes the XML reader and its underlying stream, ignoring close failures. */
	private void closeInput() {
		// XMLStreamReader.close() does not close the underlying stream,
		// so both have to be closed
		try {
			if (parser != null) parser.close()
		} catch (Exception ignored) {
			// nothing useful can be done if closing the reader fails
		}
		try {
			if (inputData != null) inputData.close()
		} catch (Exception ignored) {
			// nothing useful can be done if closing the stream fails
		}
	}

	/** START_ELEMENT: tracks the body and resets the paragraph state on each "p". */
	private void handleStartElement() {
		localname = parser.getLocalName()
		switch (localname) {
			case "body":
				println "START OF BODY"
				start = true;
				break;
			case "p":
				loc = null
				allTxt = ""
				type = "";
				startOfP = true;
				break
		}
	}

	/** END_ELEMENT: leaves the body, or converts the paragraph that just ended. */
	private void handleEndElement() {
		localname = parser.getLocalName()
		switch (localname) {
			case "body":
				start = false;
				break
			case "p":
				handleEndOfParagraph()
				break
		}
	}

	/**
	 * CHARACTERS: accumulates the paragraph text and, on the first time bullet
	 * met inside the body, opens the "Episode" and the first "Section".
	 */
	private void handleCharacters() {
		String txt = parser.getText().replaceAll("\n", " ")
		// a run of 4 spaces separates the speaker label from the speech
		allTxt += txt.replaceAll("    ", "####")

		if (!startTranscription && start && txt.matches("\\(.+:..:..\\..+\\)")) {
			println "START with $txt"
			startTranscription = true

			pagedWriter.writeStartElement("Episode");
			pagedWriter.writeAttribute("program", "NA");
			pagedWriter.writeAttribute("air_date", "NA");
			isThemeOpened = true
			pagedWriter.writeStartElement("Section");
			pagedWriter.writeAttribute("type", "report");
			pagedWriter.writeAttribute("topic", "");
			pagedWriter.writeAttribute("startTime", "0.0");
			pagedWriter.writeAttribute("endTime", "");
		}

		if (start && startTranscription && txt.length() > 0 && startOfP) {
			startOfP = false
		}
	}

	/** Extracts the speaker label of the paragraph (if any) and writes the paragraph. */
	private void handleEndOfParagraph() {
		// "> 0": a separator at the very beginning does not define a speaker
		if (allTxt.indexOf("####") > 0) {
			if (allTxt.startsWith("\"")) allTxt = allTxt.substring(1)
			loc = allTxt.substring(0, allTxt.indexOf("####")).trim()
			locs.add(loc)
			allTxt = allTxt.substring(allTxt.indexOf("####")+4) // keep the speech only
		}
		String trimedTxt = allTxt.trim()

		// nothing is written before the first time bullet of the body
		if (startTranscription) {
			if (loc) {
				writeSpeakerTurn()
			} else { // Sync or Comment or Section : out of Turn
				writeOutOfTurn(trimedTxt)
			}
		}

		allTxt = ""
	}

	/** Writes the current paragraph as a "Turn" of the speaker held in loc. */
	private void writeSpeakerTurn() {
		inTurn = true;
		pagedWriter.writeStartElement("Turn");
		pagedWriter.writeAttribute("speaker", loc);
		pagedWriter.writeAttribute("startTime", "");
		pagedWriter.writeAttribute("endTime", "");

		// write Sync: the first turn starts at 0, the others get an empty time
		if (firstTurn) {
			writeTimeCode("(0:00:00.0)")
			firstTurn = false;
		} else {
			pagedWriter.writeEmptyElement("Sync")
			pagedWriter.writeAttribute("time", "");
		}

		if (!processCommentAndSync(allTxt)) {
			if (allTxt.trim().length() > 0)
				println "ERROR $lastTiming 'contains ([]) but malformed' error line "+parser.getLocation().getLineNumber()+" : $allTxt"
		}
		pagedWriter.writeEndElement(); // endTurn

		inTurn = false
		time++;
	}

	/**
	 * Writes a paragraph that has no speaker: "(...)" is processed as time
	 * bullets and comments, "[...]" as a section, anything else is reported.
	 */
	private void writeOutOfTurn(String trimedTxt) {
		if (allTxt.length() == 0) return

		if (trimedTxt.startsWith("(") && trimedTxt.endsWith(")")) {
			if (!processCommentAndSync(allTxt)) {
				if (allTxt.trim().length() > 0)
					println "ERROR $lastTiming 'contains ([]) but malformed' line "+parser.getLocation().getLineNumber()+" : $allTxt"
			}
		} else if (trimedTxt.startsWith("[") && trimedTxt.endsWith("]")) {
			writeSection(trimedTxt)
		} else if (allTxt.trim().length() > 0) {
			println "ERROR $lastTiming 'not in Turn and not a comment' line "+parser.getLocation().getLineNumber()+" : $allTxt"
		}
	}

	/**
	 * Closes the current "Section" (if any) and opens a new one with the
	 * name="value" pairs found in trimedTxt.
	 */
	private void writeSection(String trimedTxt) {
		println "SECTION: "+trimedTxt

		// Default attributes first; a pair found in the text REPLACES the default
		// of the same name instead of being written a second time, which would
		// produce a duplicated attribute (not well-formed XML).
		def attributes = [type: "report", topic: "", startTime: "", endTime: ""]
		boolean found = false
		def m = (trimedTxt =~ /([a-zA-Z]+)="([^"]+)"/)
		while (m.find()) {
			found = true
			attributes.put(m.group(1), m.group(2))
		}

		if (!found) {
			println "ERROR $lastTiming malformed section "+trimedTxt
			return
		}

		if (isThemeOpened)
			pagedWriter.writeEndElement(); // previous Section
		isThemeOpened = true;
		pagedWriter.writeStartElement("Section");
		for (def attribute : attributes.entrySet()) {
			pagedWriter.writeAttribute(attribute.getKey(), attribute.getValue());
		}
	}

	/**
	 * Writes a "Sync" element for a time bullet. Outside of a turn (and except
	 * for the very first time code written) the Sync is wrapped in a "Turn" of
	 * speaker "none".
	 *
	 * @param txt the time bullet with its parentheses, e.g. "(0:12:34.5)"
	 * @throws ParseException if txt is not a well-formed time code
	 */
	def writeTimeCode(String txt) {
		String str = txt.substring(1, txt.length()-1)
		double seconds = parseTimeCode(str)
		lastTiming = str;

		if (inTurn || firstSync) {
			writeSync(seconds)
		} else {
			pagedWriter.writeStartElement("Turn");
			pagedWriter.writeAttribute("speaker", "none");
			pagedWriter.writeAttribute("startTime", "");
			pagedWriter.writeAttribute("endTime", "");
			inTurn = true
			writeSync(seconds)
			inTurn = false
			pagedWriter.writeEndElement(); // endTurn
		}
		firstSync = false
	}

	/** Writes an empty "Sync" element with the given time in seconds. */
	private void writeSync(double seconds) {
		pagedWriter.writeEmptyElement("Sync");
		pagedWriter.writeAttribute("time", ""+seconds);
	}

	/**
	 * Converts "h:mm:ss" or "h:mm:ss.d" into seconds.
	 *
	 * The value is computed by hand rather than with SimpleDateFormat("h:mm:ss.S")
	 * because with that pattern "S" reads the digits as a number of milliseconds
	 * (".5" gave 0.005 s instead of 0.5 s), "h" is a 1-12 clock hour ("12" gave 0)
	 * and a time code without fraction could not be parsed at all.
	 *
	 * @param str the time code without parentheses
	 * @return the time in seconds
	 * @throws ParseException if str is not a well-formed time code
	 */
	private double parseTimeCode(String str) {
		def m = (str =~ /([0-9]+):([0-9][0-9]?):([0-9][0-9]?)(?:\.([0-9]+))?/)
		if (!m.matches()) {
			throw new ParseException("Unparseable time code: \"" + str + "\"", 0)
		}
		long wholeSeconds = (Long.parseLong(m.group(1)) * 60L + Long.parseLong(m.group(2))) * 60L + Long.parseLong(m.group(3))
		String fraction = (m.group(4) == null) ? "0" : m.group(4)
		// parsing the decimal notation gives the correctly rounded double
		return Double.parseDouble(wholeSeconds + "." + fraction)
	}

	/**
	 * Writes a piece of transcription text: time bullets become "Sync" elements,
	 * other "(...)" groups become "Comment" elements and the remaining text is
	 * written as characters when the paragraph has a speaker (it is reported on
	 * the console otherwise).
	 *
	 * @param txt the text to process
	 * @return false if a parenthesis or a square bracket is left over after the
	 *         comments have been extracted (malformed text), true otherwise
	 */
	def processCommentAndSync(String txt) {
		if (txt.length() == 0) return true;

		boolean ret = true;

		// split txt on the time bullets, keeping the bullets as separate pieces
		def pieces = []
		int lastMatch = 0
		def m = (txt =~ bullet_regex) // time
		while (m.find()) {
			// text between the end of the last bullet and this one (may be empty)
			pieces << txt.substring(lastMatch, m.start())
			// the bullet itself
			pieces << m.group()
			lastMatch = m.end()
		}
		// everything after the last bullet
		pieces << txt.substring(lastMatch)

		for (String str : pieces) {
			if (str.trim().length() == 0) continue

			if (str.matches(bullet_regex)) {
				writeTimeCode(str)
				continue
			}

			// extract the "(...)" comments one after the other
			int idxOpen = str.indexOf("(");
			int idxClose = str.indexOf(")")
			while (idxOpen >= 0 && idxClose > idxOpen) {
				if (idxOpen > 0) { // some text stands before the comment
					if (loc != null) {
						pagedWriter.writeCharacters(str.substring(0, idxOpen))
					} else {
						if (str.trim().length() > 0)
							println "WARNING $lastTiming no locutor at line "+parser.getLocation().getLineNumber()+" : $str"
					}
				}
				pagedWriter.writeEmptyElement("Comment")
				pagedWriter.writeAttribute("desc", str.substring(idxOpen+1, idxClose))

				str = str.substring(idxClose+1)
				idxOpen = str.indexOf("(");
				idxClose = str.indexOf(")")
			}

			// text left after the last comment
			if (str.length() > 0) {
				if (loc != null) {
					pagedWriter.writeCharacters(str)
				} else {
					if (str.trim().length() > 0)
						println "ERROR $lastTiming 'not in turn and not a comment' line "+parser.getLocation().getLineNumber()+" : $str"
				}
			}

			// a leftover bracket means an unbalanced comment or section
			if (str.contains("(") || str.contains("[") || str.contains(")") || str.contains("]")) {
				ret = false;
			}
		}
		return ret;
	}
}