Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / transcription / TextTranscription2TRS.groovy @ 479

History | View | Annotate | Download (8.4 kB)

1 321 mdecorde
// Copyright © 2010-2013 ENS de Lyon.
2 321 mdecorde
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3 321 mdecorde
// Lyon 2, University of Franche-Comté, University of Nice
4 321 mdecorde
// Sophia Antipolis, University of Paris 3.
5 321 mdecorde
//
6 321 mdecorde
// The TXM platform is free software: you can redistribute it
7 321 mdecorde
// and/or modify it under the terms of the GNU General Public
8 321 mdecorde
// License as published by the Free Software Foundation,
9 321 mdecorde
// either version 2 of the License, or (at your option) any
10 321 mdecorde
// later version.
11 321 mdecorde
//
12 321 mdecorde
// The TXM platform is distributed in the hope that it will be
13 321 mdecorde
// useful, but WITHOUT ANY WARRANTY; without even the implied
14 321 mdecorde
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15 321 mdecorde
// PURPOSE. See the GNU General Public License for more
16 321 mdecorde
// details.
17 321 mdecorde
//
18 321 mdecorde
// You should have received a copy of the GNU General
19 321 mdecorde
// Public License along with the TXM platform. If not, see
20 321 mdecorde
// http://www.gnu.org/licenses.
21 321 mdecorde
22 321 mdecorde
package org.txm.macro.transcription
23 321 mdecorde
24 321 mdecorde
import org.kohsuke.args4j.*
25 321 mdecorde
26 479 mdecorde
import groovy.transform.Field
27 479 mdecorde
28 479 mdecorde
import org.txm.rcpapplication.swt.widget.parameters.*
29 479 mdecorde
import org.txm.utils.xml.DomUtils;
30 479 mdecorde
import org.w3c.tidy.Tidy
31 479 mdecorde
import org.txm.doc.*;
32 479 mdecorde
33 479 mdecorde
import javax.xml.stream.*;
34 479 mdecorde
35 479 mdecorde
import java.util.HashMap;
36 479 mdecorde
import java.util.List;
37 479 mdecorde
38 479 mdecorde
import org.txm.importer.*;
39 479 mdecorde
import org.xml.sax.Attributes;
40 479 mdecorde
import org.txm.importer.filters.*;
41 479 mdecorde
42 479 mdecorde
import java.io.File;
43 479 mdecorde
import java.io.IOException;
44 479 mdecorde
import java.util.ArrayList;
45 479 mdecorde
46 479 mdecorde
import javax.xml.parsers.SAXParserFactory;
47 479 mdecorde
import javax.xml.parsers.ParserConfigurationException;
48 479 mdecorde
import javax.xml.parsers.SAXParser;
49 479 mdecorde
50 479 mdecorde
import java.net.URL;
51 479 mdecorde
52 479 mdecorde
import org.xml.sax.InputSource;
53 479 mdecorde
import org.xml.sax.helpers.DefaultHandler;
54 479 mdecorde
55 479 mdecorde
import java.text.DecimalFormat;
56 479 mdecorde
import java.text.ParseException;
57 479 mdecorde
import java.text.SimpleDateFormat;
58 479 mdecorde
import java.util.Date;
59 479 mdecorde
60 321 mdecorde
class TextTranscription2TRS {
61 321 mdecorde
        boolean debug = false;
62 321 mdecorde
        boolean isDirectory = false;
63 321 mdecorde
        File outDir;
64 321 mdecorde
        ConvertDocument converter;
65 321 mdecorde
        def monitor
66 321 mdecorde
67 321 mdecorde
        /**
         * Builds a transcription converter.
         *
         * @param outDir directory in which the intermediate and final files are created
         * @param debug when true, stack traces are printed and intermediate files are kept
         * @param monitor optional progress monitor (may be null); the class calls its
         *        worked(int, String) and isCanceled() methods
         */
        TextTranscription2TRS(File outDir, boolean debug, def monitor) {
                this.debug = debug
                this.monitor = monitor
                this.outDir = outDir
        }
72 321 mdecorde
73 321 mdecorde
        /**
         * Entry point: converts either a single document or every supported document
         * of a directory.
         *
         * The isDirectory field is updated from the argument; DOCtoTXT reads it to decide
         * whether it must create and stop its own converter.
         *
         * @param file a document, or a directory containing documents
         * @return the result of processDir/processFile, or false when an exception was caught
         */
        boolean process(File file) {
                try {
                        isDirectory = file.isDirectory()
                        // a single file receives the whole progress budget (100)
                        return isDirectory ? processDir(file) : processFile(file, 100)
                } catch (Exception e) {
                        println "Error while processing: "+e
                        if (debug) {
                                e.printStackTrace()
                        }
                        return false
                }
        }
86 321 mdecorde
87 321 mdecorde
        /**
         * Converts every supported document found directly in a directory.
         *
         * Supported documents are regular files whose lower-cased name ends with
         * .odt, .doc, .rtf or .txt. One ConvertDocument instance is created for the whole
         * directory and is always stopped before returning, including when no file is
         * found or when an exception is caught.
         *
         * @param dir the directory to scan (not recursive)
         * @return false if no supported file was found, if at least one file failed,
         *         or if an exception was caught; true otherwise
         */
        boolean processDir(File dir) {
                boolean ret = true
                try {
                        println "* Processing $dir directory"
                        converter = new ConvertDocument()

                        def extensions = [".odt", ".doc", ".rtf", ".txt"]
                        def okfiles = []
                        def files = dir.listFiles() // null when dir cannot be listed
                        if (files != null) {
                                for (File file : files) {
                                        String fname = file.getName().toLowerCase()
                                        if (file.isFile() && extensions.any { fname.endsWith(it) }) {
                                                okfiles << file
                                        }
                                }
                        }

                        if (okfiles.isEmpty()) {
                                println "No ODT/DOC/RTF/TXT file found in $dir"
                                return false // the finally block still stops the converter
                        }

                        // share the progress budget (100) evenly between the files
                        int delta = 100.intdiv(okfiles.size())
                        for (File file : okfiles) {
                                if (monitor != null) monitor.worked(0, "Processing file: $file")
                                if (!processFile(file, delta)) {
                                        ret = false
                                        println "ERROR: Failed to process $file"
                                }

                                if (monitor != null && monitor.isCanceled()) {
                                        break // stop looping
                                }
                        }
                } catch (Exception e) {
                        println "Error while processing directory: "+e
                        if (debug) e.printStackTrace()
                        ret = false // an aborted run must not be reported as a success
                } finally {
                        if (converter != null) converter.stop()
                }
                return ret
        }
128 321 mdecorde
129 321 mdecorde
        /**
         * Converts one document into a .trs file written in outDir.
         *
         * Stages: (1) the document is converted to "name.txt" with DOCtoTXT, or copied when it
         * already is a .txt file; (2) TXTtoTRS produces "name-tmp.trs"; (3) TRStoFixedTRS
         * produces "name.trs". Unless debug is set, the two intermediate files are deleted
         * after a successful run. If the input is a .txt file that already is the intermediate
         * text file (same location as outDir), it is neither copied onto itself nor deleted.
         *
         * @param docFile the document to convert; null or a name without extension yields false
         * @param work amount of progress allotted to this file; reported in steps of work/5
         * @return true if every stage succeeded and the user did not cancel
         */
        boolean processFile(File docFile, int work) {
                if (docFile == null) return false

                int delta = work.intdiv(5)
                println "** Processing $docFile file"

                String name = docFile.getName()
                int idx = name.lastIndexOf(".")
                if (idx < 0) return false // no extension: cannot derive the output names
                boolean alreadyText = ".txt".equalsIgnoreCase(name.substring(idx))
                String filename = name.substring(0, idx)

                File txtFile = new File(outDir, "${filename}.txt")
                File trsFile = new File(outDir, "${filename}-tmp.trs")
                File finalTrsFile = new File(outDir, "${filename}.trs")

                // true when the intermediate text file IS the user's source document
                boolean txtIsSource = false

                if (!alreadyText) {
                        if (monitor != null) monitor.worked(0, "DOCtoTXT: "+docFile.getName())

                        boolean converted = DOCtoTXT(docFile, txtFile)
                        if (monitor != null) monitor.worked(delta, "DOCtoTXT: "+docFile.getName())
                        if (interruptedByUser()) return false
                        if (!converted) return false
                } else if (isSameFile(docFile, txtFile)) {
                        txtIsSource = true // nothing to copy, and it must survive the cleanup
                } else {
                        org.txm.utils.FileCopy.copy(docFile, txtFile)
                }

                boolean ok = TXTtoTRS(txtFile, trsFile)
                if (monitor != null) monitor.worked(delta, "TXTtoTRS: "+docFile.getName())
                if (interruptedByUser()) return false
                if (!ok) return false

                ok = TRStoFixedTRS(trsFile, finalTrsFile)
                if (monitor != null) monitor.worked(delta, "Done: "+docFile.getName())
                if (interruptedByUser()) return false
                if (!ok) return false

                // cleaning
                // NOTE(review): the gc() call presumably helps releasing lingering file handles
                // before the deletions below - confirm before removing it
                System.gc()
                if (!debug) {
                        if (!txtIsSource) txtFile.delete()
                        trsFile.delete()
                }

                return true
        }

        /** Tells whether the monitor reports a cancellation, printing a notice when it does. */
        private boolean interruptedByUser() {
                if (monitor != null && monitor.isCanceled()) {
                        println "Process interrupted by user"
                        return true
                }
                return false
        }

        /** Tells whether two File objects designate the same file, comparing canonical paths. */
        private static boolean isSameFile(File a, File b) {
                try {
                        return a.getCanonicalFile().equals(b.getCanonicalFile())
                } catch (IOException e) {
                        // canonical path unavailable: fall back to the absolute paths
                        return a.getAbsoluteFile().equals(b.getAbsoluteFile())
                }
        }
190 321 mdecorde
191 321 mdecorde
        /**
         * Converts a document to plain text with the ConvertDocument helper.
         *
         * When isDirectory is false, a converter is created for this call and stopped
         * afterwards; otherwise the converter field is used as is and left running.
         *
         * @param docFile the document to convert
         * @param txtFile the text file to produce
         * @return true only if the conversion raised no exception and txtFile exists and
         *         is not empty; an exception yields false even if an older or partial
         *         txtFile is present
         */
        boolean DOCtoTXT(File docFile, File txtFile) {
                println "*** ODT -> TXT"
                boolean converted = false
                try {
                        if (!isDirectory) converter = new ConvertDocument()
                        converter.setDebug(debug)
                        converter.autoFile(docFile, txtFile, "txt")
                        converted = true
                } catch (Exception e) {
                        println "Error while converting $docFile : $e"
                        if (debug) e.printStackTrace()
                } finally {
                        if (!isDirectory && converter != null) converter.stop()
                }
                return converted && txtFile.exists() && txtFile.length() > 0
        }
205 321 mdecorde
206 321 mdecorde
//        boolean HTMLtoHTMLforTidy(File htmlFile, File htmlFile2) {
207 321 mdecorde
//                println "*** HTML -> HTML for tidy"
208 321 mdecorde
//                try {
209 321 mdecorde
//                        println "replace TABS with 4 spaces"
210 321 mdecorde
//                        String text2 = htmlFile.getText("UTF-8")
211 321 mdecorde
//                        text2 = text2.replaceAll("&nbsp;", " ")
212 321 mdecorde
//                        text2 = text2.replaceAll("’", "'")
213 321 mdecorde
//                        text2 = text2.replaceAll("&rsquo;", "'")
214 321 mdecorde
//                        text2 = text2.replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;")
215 321 mdecorde
//                        text2 = text2.replaceAll("\n", '&nbsp;')
216 321 mdecorde
//                        text2 = text2.replaceAll("\r\n", '&nbsp;')
217 321 mdecorde
//                        htmlFile2.withWriter("UTF-8") { writer ->
218 321 mdecorde
//                                writer.write(text2);
219 321 mdecorde
//                        }
220 321 mdecorde
//                } catch(Exception e) {
221 321 mdecorde
//                        println "Error while preparing HTML of $htmlFile : $e"
222 321 mdecorde
//                        if (debug) e.printStackTrace()
223 321 mdecorde
//                }
224 321 mdecorde
//                return htmlFile2.exists() && htmlFile2.length() > 0
225 321 mdecorde
//        }
226 321 mdecorde
//
227 321 mdecorde
//        boolean HTMLtoXHTML(File htmlFile2, File xhtmlFile) {
228 321 mdecorde
//                println "*** HTML for tidy -> XHTML"
229 321 mdecorde
//                try {
230 321 mdecorde
//                        Tidy tidy = new Tidy(); // obtain a new Tidy instance
231 321 mdecorde
//                        tidy.setXHTML(true); // set desired config options using tidy setters
232 321 mdecorde
//                        tidy.setInputEncoding("UTF-8")
233 321 mdecorde
//                        tidy.setOutputEncoding("UTF-8")
234 321 mdecorde
//                        tidy.setShowErrors(100)
235 321 mdecorde
//                        tidy.setShowWarnings(debug)
236 321 mdecorde
//                        tidy.setTabsize(10)
237 321 mdecorde
//                        tidy.setWraplen(9999)
238 321 mdecorde
//                        tidy.setForceOutput(true) // Tidy won't stop if error are found
239 321 mdecorde
//                        xhtmlFile.withWriter("UTF-8") { out ->
240 321 mdecorde
//                                def input = new InputStreamReader(htmlFile2.toURI().toURL().newInputStream(), "UTF-8")
241 321 mdecorde
//                                tidy.parse(input, out); // run tidy, providing an input and output stream
242 321 mdecorde
//                        }
243 321 mdecorde
//                        if (xhtmlFile.exists()) {
244 321 mdecorde
//                                // JTidy produced a "0x0" char. removing them
245 321 mdecorde
//                                // fix separated < and / ???
246 321 mdecorde
//                                def c = Character.toChars(0)[0]
247 321 mdecorde
//                                String txttmp = xhtmlFile.getText("UTF-8");
248 321 mdecorde
//                                xhtmlFile.withWriter("UTF-8") { out ->
249 321 mdecorde
//                                        out.write(txttmp.replace("<\n/", "</").replace("<\r\n/", "</"))
250 321 mdecorde
//                                }
251 321 mdecorde
//                        }
252 321 mdecorde
//                } catch(Exception e) {
253 321 mdecorde
//                        println "Error while applying JTidy: "+e
254 321 mdecorde
//                        if (debug) e.printStackTrace()
255 321 mdecorde
//                }
256 321 mdecorde
//                return xhtmlFile.exists() && xhtmlFile.length() > 0
257 321 mdecorde
//        }
258 321 mdecorde
259 321 mdecorde
        /**
         * Builds a TRS file from a plain text file by delegating to TXT2TRS.
         *
         * @param txtFile the text file to read
         * @param trsFile the TRS file to produce
         * @return true only if TXT2TRS raised no exception and trsFile exists and is not
         *         empty; an exception yields false even if an older or partial trsFile
         *         is present
         */
        boolean TXTtoTRS(File txtFile, File trsFile) {
                println "*** TXT -> TRS"
                boolean created = false
                try {
                        TXT2TRS ht = new TXT2TRS()
                        ht.process(txtFile, trsFile)
                        created = true
                } catch (Exception e) {
                        println "Error while creating TRS file: "+e
                        if (debug) e.printStackTrace()
                }
                return created && trsFile.exists() && trsFile.length() > 0
        }
270 321 mdecorde
271 321 mdecorde
        /**
         * Produces the final TRS file by running SetTRSTurnTimingsAndSpeakers on the
         * temporary one.
         *
         * A false result from the fixer is only reported on the console; the outcome is then
         * decided by the presence of the output file. An exception, however, always yields
         * false, even if an older or partial finalTrsFile is present.
         *
         * @param trsFile the TRS file to read
         * @param finalTrsFile the fixed TRS file to produce
         * @return true if no exception was raised and finalTrsFile exists and is not empty
         */
        boolean TRStoFixedTRS(File trsFile, File finalTrsFile) {
                println "*** TRS -> FIXED TRS"
                boolean ran = false
                try {
                        def timingfixer = new SetTRSTurnTimingsAndSpeakers(trsFile, finalTrsFile)
                        if (!timingfixer.process()) {
                                println "Error while fixing timing"
                        }
                        ran = true
                } catch (Exception e) {
                        println "Error while fixing TRS file: "+e
                        if (debug) e.printStackTrace()
                }
                return ran && finalTrsFile.exists() && finalTrsFile.length() > 0
        }
284 321 mdecorde
}