Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / transcription / TextTranscription2TRS.groovy @ 479

History | View | Annotate | Download (8.4 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
package org.txm.macro.transcription
23

    
24
import org.kohsuke.args4j.*
25

    
26
import groovy.transform.Field
27

    
28
import org.txm.rcpapplication.swt.widget.parameters.*
29
import org.txm.utils.xml.DomUtils;
30
import org.w3c.tidy.Tidy
31
import org.txm.doc.*;
32

    
33
import javax.xml.stream.*;
34

    
35
import java.util.HashMap;
36
import java.util.List;
37

    
38
import org.txm.importer.*;
39
import org.xml.sax.Attributes;
40
import org.txm.importer.filters.*;
41

    
42
import java.io.File;
43
import java.io.IOException;
44
import java.util.ArrayList;
45

    
46
import javax.xml.parsers.SAXParserFactory;
47
import javax.xml.parsers.ParserConfigurationException;
48
import javax.xml.parsers.SAXParser;
49

    
50
import java.net.URL;
51

    
52
import org.xml.sax.InputSource;
53
import org.xml.sax.helpers.DefaultHandler;
54

    
55
import java.text.DecimalFormat;
56
import java.text.ParseException;
57
import java.text.SimpleDateFormat;
58
import java.util.Date;
59

    
60
/**
 * Converts text transcription documents (ODT, DOC, RTF or TXT) into TRS files.
 *
 * A document goes through up to three steps:
 * document -> TXT (skipped for TXT sources), TXT -> temporary TRS,
 * temporary TRS -> final TRS with turn timings and speakers fixed.
 * All files are written in {@code outDir}; intermediate files are removed
 * unless {@code debug} is set.
 */
class TextTranscription2TRS {

    /** Lower-cased file name suffixes accepted when scanning a directory. */
    private static final List<String> SUPPORTED_EXTENSIONS = [".odt", ".doc", ".rtf", ".txt"]

    /** When true, stack traces are printed and intermediate files are kept. */
    boolean debug = false
    /** True while a directory is processed, i.e. while {@code converter} is shared between files. */
    boolean isDirectory = false
    /** Directory receiving the intermediate and final files. */
    File outDir
    /** Document converter; shared for a whole directory, created per file otherwise. */
    ConvertDocument converter
    /** Optional progress monitor; only {@code worked(int, String)} and {@code isCanceled()} are called on it. May be null. */
    def monitor

    /**
     * @param outDir directory where the result files are written
     * @param debug true to print stack traces and keep intermediate files
     * @param monitor progress monitor, or null when no progress reporting is wanted
     */
    TextTranscription2TRS(File outDir, boolean debug, def monitor) {
        this.outDir = outDir
        this.monitor = monitor
        this.debug = debug
    }

    /**
     * Processes a single document, or every supported document of a directory.
     *
     * @param file a document or a directory
     * @return true if everything was converted, false on any failure (including an exception)
     */
    boolean process(File file) {
        try {
            isDirectory = file.isDirectory()
            return isDirectory ? processDir(file) : processFile(file, 100)
        } catch (Exception e) {
            println "Error while processing: "+e
            if (debug) e.printStackTrace()
            return false // explicit: a failed run must never look like a success
        }
    }

    /**
     * Converts every ODT/DOC/RTF/TXT file found directly in {@code dir}
     * (sub-directories are not visited), sharing one converter for all files.
     *
     * @param dir the directory to scan
     * @return true if at least one file was found and all processed files succeeded
     */
    boolean processDir(File dir) {
        boolean ret = true
        // DOCtoTXT() relies on this flag to know that the converter is shared.
        // Set it here as well so that a direct call to processDir() does not
        // make DOCtoTXT() replace (and leak) the shared converter.
        boolean previousMode = isDirectory
        isDirectory = true
        try {
            println "* Processing $dir directory"
            converter = new ConvertDocument()

            def okfiles = []
            def files = dir.listFiles() // null when dir is not a readable directory
            if (files != null) {
                for (File file : files) {
                    String fname = file.getName().toLowerCase()
                    if (file.isFile() && SUPPORTED_EXTENSIONS.any { fname.endsWith(it) }) {
                        okfiles << file
                    }
                }
            }

            if (okfiles.size() == 0) {
                println "No ODT/DOC/RTF/TXT file found in $dir"
                return false // the finally block still stops the converter
            }

            // share of the 100 work units given to each file
            int delta = 100.intdiv(okfiles.size())
            for (File file : okfiles) {
                reportProgress(0, "Processing file: $file")
                if (!processFile(file, delta)) {
                    ret = false
                    println "ERROR: Failed to process $file"
                }

                if (monitor != null && monitor.isCanceled()) {
                    break // stop looping
                }
            }
        } catch (Exception e) {
            ret = false
            println "Error while processing directory: "+e
            if (debug) e.printStackTrace()
        } finally {
            if (converter != null) converter.stop()
            isDirectory = previousMode
        }
        return ret
    }

    /**
     * Converts one document to a TRS file named after the document in {@code outDir}.
     *
     * @param docFile the ODT/DOC/RTF/TXT document; must have a file extension
     * @param work work units allotted to this file; each step reports one fifth of it
     * @return true if the final TRS file was produced, false on failure or user cancellation
     */
    boolean processFile(File docFile, int work) {
        if (docFile == null) return false
        int delta = work.intdiv(5)
        println "** Processing $docFile file"

        String filename = docFile.getName()
        int idx = filename.lastIndexOf(".")
        if (idx < 0) return false
        String ext = filename.substring(idx).toLowerCase()
        filename = filename.substring(0, idx)

        //File docFile = new File(srcDir, "${filename}$ext")
        //                File htmlFile = new File(outDir, "${filename}.html")
        //                File htmlFile2 = new File(outDir, "${filename}_temp.html")
        //                File xhtmlFile = new File(outDir, "${filename}.xhtml")
        //                File xmlFile = new File(outDir, "${filename}.xml")
        File txtFile = new File(outDir, "${filename}.txt")
        File trsFile = new File(outDir, "${filename}-tmp.trs")
        File finalTrsFile = new File(outDir, "${filename}.trs")

        // A TXT source already located in outDir IS the intermediate file:
        // it must neither be copied onto itself nor deleted during cleanup.
        boolean sourceIsTxtFile = docFile.getAbsoluteFile().equals(txtFile.getAbsoluteFile())

        try {
            if (ext != ".txt") {
                reportProgress(0, "DOCtoTXT: "+docFile.getName())

                boolean converted = DOCtoTXT(docFile, txtFile)
                reportProgress(delta, "DOCtoTXT: "+docFile.getName())
                if (interruptedByUser()) return false
                if (!converted) return false
            } else if (!sourceIsTxtFile) { // the document is already a TXT file
                org.txm.utils.FileCopy.copy(docFile, txtFile)
            }

            boolean transcribed = TXTtoTRS(txtFile, trsFile)
            reportProgress(delta, "TXTtoTRS: "+docFile.getName())
            if (interruptedByUser()) return false
            if (!transcribed) return false

            boolean fixed = TRStoFixedTRS(trsFile, finalTrsFile)
            reportProgress(delta, "Done: "+docFile.getName())
            if (interruptedByUser()) return false
            return fixed
        } finally {
            // cleaning, also done on failure or cancellation so that no
            // intermediate file is left behind
            // NOTE(review): gc() presumably helps releasing lingering file handles before deletion - confirm
            System.gc()
            if (!debug) {
                //htmlFile.delete();htmlFile2.delete();xhtmlFile.delete();xmlFile.delete();
                if (!sourceIsTxtFile) txtFile.delete()
                trsFile.delete()
            }
        }
    }

    /** Forwards a progress report to the monitor, when there is one. */
    private void reportProgress(int amount, String message) {
        if (monitor != null) monitor.worked(amount, message)
    }

    /** Returns true, after telling the user, when the monitor reports a cancellation. */
    private boolean interruptedByUser() {
        if (monitor != null && monitor.isCanceled()) {
            println "Process interrupted by user"
            return true
        }
        return false
    }

    /**
     * Converts a document to plain text with the document converter.
     * Outside of directory mode a converter is created for this call and stopped afterwards.
     *
     * @param docFile the document to convert
     * @param txtFile the text file to create
     * @return true if {@code txtFile} exists and is not empty after the conversion attempt
     */
    boolean DOCtoTXT(File docFile, File txtFile) {
        println "*** ODT -> TXT"
        try {
            if (!isDirectory) converter = new ConvertDocument()
            converter.setDebug(debug)
            converter.autoFile(docFile, txtFile, "txt")
        } catch (Exception e) {
            println "Error while converting $docFile : $e"
            if (debug) e.printStackTrace()
        } finally {
            // in directory mode the shared converter is stopped by processDir()
            if (!isDirectory && converter != null) converter.stop()
        }
        return txtFile.exists() && txtFile.length() > 0
    }

//        boolean HTMLtoHTMLforTidy(File htmlFile, File htmlFile2) {
//                println "*** HTML -> HTML for tidy"
//                try {
//                        println "replace TABS with 4 spaces"
//                        String text2 = htmlFile.getText("UTF-8")
//                        text2 = text2.replaceAll("&nbsp;", " ")
//                        text2 = text2.replaceAll("’", "'")
//                        text2 = text2.replaceAll("&rsquo;", "'")
//                        text2 = text2.replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;")
//                        text2 = text2.replaceAll("\n", '&nbsp;')
//                        text2 = text2.replaceAll("\r\n", '&nbsp;')
//                        htmlFile2.withWriter("UTF-8") { writer ->
//                                writer.write(text2);
//                        }
//                } catch(Exception e) {
//                        println "Error while preparing HTML of $htmlFile : $e"
//                        if (debug) e.printStackTrace()
//                }
//                return htmlFile2.exists() && htmlFile2.length() > 0
//        }
//
//        boolean HTMLtoXHTML(File htmlFile2, File xhtmlFile) {
//                println "*** HTML for tidy -> XHTML"
//                try {
//                        Tidy tidy = new Tidy(); // obtain a new Tidy instance
//                        tidy.setXHTML(true); // set desired config options using tidy setters
//                        tidy.setInputEncoding("UTF-8")
//                        tidy.setOutputEncoding("UTF-8")
//                        tidy.setShowErrors(100)
//                        tidy.setShowWarnings(debug)
//                        tidy.setTabsize(10)
//                        tidy.setWraplen(9999)
//                        tidy.setForceOutput(true) // Tidy won't stop if error are found
//                        xhtmlFile.withWriter("UTF-8") { out ->
//                                def input = new InputStreamReader(htmlFile2.toURI().toURL().newInputStream(), "UTF-8")
//                                tidy.parse(input, out); // run tidy, providing an input and output stream
//                        }
//                        if (xhtmlFile.exists()) {
//                                // JTidy produced a "0x0" char. removing them
//                                // fix separated < and / ???
//                                def c = Character.toChars(0)[0]
//                                String txttmp = xhtmlFile.getText("UTF-8");
//                                xhtmlFile.withWriter("UTF-8") { out ->
//                                        out.write(txttmp.replace("<\n/", "</").replace("<\r\n/", "</"))
//                                }
//                        }
//                } catch(Exception e) {
//                        println "Error while applying JTidy: "+e
//                        if (debug) e.printStackTrace()
//                }
//                return xhtmlFile.exists() && xhtmlFile.length() > 0
//        }

    /**
     * Builds the temporary TRS file from the plain text transcription.
     *
     * @param txtFile the text transcription
     * @param trsFile the TRS file to create
     * @return true if {@code trsFile} exists and is not empty after the attempt
     */
    boolean TXTtoTRS(File txtFile, File trsFile) {
        println "*** TXT -> TRS"
        try {
            TXT2TRS ht = new TXT2TRS()
            ht.process(txtFile, trsFile)
        } catch (Exception e) {
            println "Error while creating TRS file: "+e
            if (debug) e.printStackTrace()
        }
        return trsFile.exists() && trsFile.length() > 0
    }

    /**
     * Writes the final TRS file by running the turn timings and speakers fixer
     * on the temporary TRS file.
     *
     * @param trsFile the temporary TRS file
     * @param finalTrsFile the TRS file to create
     * @return true only if the fixer reported success and {@code finalTrsFile}
     *         exists and is not empty
     */
    boolean TRStoFixedTRS(File trsFile, File finalTrsFile) {
        println "*** TRS -> FIXED TRS"
        boolean fixed = false
        try {
            def timingfixer = new SetTRSTurnTimingsAndSpeakers(trsFile, finalTrsFile)
            if (timingfixer.process()) {
                fixed = true
            } else {
                // a reported failure must not be returned as a success, even
                // if a (possibly incomplete) output file was written
                println "Error while fixing timing"
            }
        } catch (Exception e) {
            println "Error while fixing TRS file: "+e
            if (debug) e.printStackTrace()
        }
        return fixed && finalTrsFile.exists() && finalTrsFile.length() > 0
    }
}
285