Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / macro / transcription / TextTranscription2TRS.groovy @ 187

History | View | Annotate | Download (7.8 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
package org.txm.macro.transcription
23

    
24
import javax.xml.stream.*
25

    
26
import org.kohsuke.args4j.*
27
import org.txm.doc.*
28
import org.txm.importer.*
29
import org.txm.importer.filters.*
30
import org.txm.rcpapplication.swt.widget.parameters.*
31

    
32
class TextTranscription2TRS {
33
        /** When true, stack traces are printed on errors and the intermediate files are kept. */
        boolean debug = false
        /** Set by process(File): true when the input being processed is a directory. */
        boolean isDirectory = false
        /** Directory receiving the intermediate (.txt, -tmp.trs) and final (.trs) files. */
        File outDir
        /** Document converter used to turn ODT/DOC/RTF documents into TXT. */
        ConvertDocument converter
        /** Optional progress monitor (may be null); worked(int, String) and isCanceled() are called on it. */
        def monitor

        /**
         * Creates a converter from text transcriptions to TRS files.
         *
         * @param outDir directory where the result files are written
         * @param debug true to print stack traces and keep the intermediate files
         * @param monitor progress monitor, or null when no progress reporting is wanted
         */
        TextTranscription2TRS(File outDir, boolean debug, def monitor) {
                this.debug = debug
                this.monitor = monitor
                this.outDir = outDir
        }
44

    
45
        /**
         * Entry point: converts one transcription file, or every supported file of a directory.
         * Records in the isDirectory field whether the input is a directory.
         *
         * @param file the file or the directory to process
         * @return the result of processDir/processFile, or false if an exception was raised
         */
        boolean process(File file) {
                try {
                        isDirectory = file.isDirectory()
                        return isDirectory ? processDir(file) : processFile(file, 100)
                } catch(Exception e) {
                        println "Error while processing: "+e
                        if (debug) e.printStackTrace()
                        // explicit failure result instead of relying on Groovy's implicit return value
                        return false
                }
        }
58

    
59
        /**
         * Converts every ODT, DOC, RTF or TXT file found directly in a directory
         * (sub-directories are not visited). A single ConvertDocument instance is created
         * for the whole directory and stopped when the method exits.
         *
         * @param dir the directory containing the documents
         * @return true if all the selected files were processed successfully; false if no
         *         supported file was found, if at least one file failed or if an exception was raised
         */
        boolean processDir(File dir) {
                boolean ret = true
                try {
                        println "* Processing $dir directory"
                        converter = new ConvertDocument()

                        def extensions = [".odt", ".doc", ".rtf", ".txt"]
                        def okfiles = []
                        def files = dir.listFiles() // null when the directory cannot be listed
                        if (files != null) {
                                for (File file : files) {
                                        String fname = file.getName().toLowerCase()
                                        if (file.isFile() && extensions.any { fname.endsWith(it) }) {
                                                okfiles << file
                                        }
                                }
                        }

                        if (okfiles.isEmpty()) {
                                println "No ODT/DOC/RTF/TXT file found in $dir"
                                return false // the finally block still stops the converter
                        }

                        // share of the 100 progress units given to each file
                        int delta = 100.intdiv(okfiles.size())
                        for (File file : okfiles) {
                                if (monitor != null) monitor.worked(0, "Processing file: $file")
                                if (!processFile(file, delta)) {
                                        ret = false
                                        println "ERROR: Failed to process $file"
                                }

                                if (monitor != null && monitor.isCanceled()) {
                                        break // stop looping
                                }
                        }
                } catch(Exception e) {
                        println "Error while processing directory: "+e
                        if (debug) e.printStackTrace()
                        // an aborted run must not be reported as a success
                        ret = false
                } finally {
                        // always release the converter, including on the early return above
                        if (converter != null) converter.stop()
                }
                return ret
        }
100

    
101
        /**
         * Converts one document to a TRS file in three steps:
         * document -> TXT (skipped when the document is already a TXT file),
         * TXT -> temporary TRS, temporary TRS -> final TRS.
         * The result files are named after the document name without its last extension.
         * Unless debug is set, the intermediate files are deleted after a successful run.
         *
         * @param docFile the document to convert
         * @param work the progress budget allotted to this file
         * @return true if the final TRS file was produced; false if docFile is null, has no
         *         extension, a step failed or the user canceled the process
         */
        boolean processFile(File docFile, int work) {
                if (docFile == null) return false
                // NOTE(review): the budget is divided by 5 although at most three steps report
                // progress below -- confirm against the total expected by the monitor
                int delta = work.intdiv(5)
                println "** Processing $docFile file"

                String docName = docFile.getName()
                // split on the LAST dot so that "a.1.odt" and "a.2.odt" do not both produce "a.trs"
                int idx = docName.lastIndexOf(".")
                if (idx < 0) return false
                boolean alreadyTxt = docName.toLowerCase().endsWith(".txt")
                String filename = docName.substring(0, idx)

                File txtFile = new File(outDir, "${filename}.txt")
                File trsFile = new File(outDir, "${filename}-tmp.trs")
                File finalTrsFile = new File(outDir, "${filename}.trs")

                // prints the interruption message and tells if the user canceled the process
                def canceled = { ->
                        if (monitor != null && monitor.isCanceled()) {
                                println "Process interrupted by user"
                                return true
                        }
                        return false
                }

                // true when the source document itself is the TXT file of the output directory
                boolean txtIsSource = false
                if (!alreadyTxt) {
                        if (monitor != null) monitor.worked(0, "DOCtoTXT: "+docFile.getName())

                        boolean converted = DOCtoTXT(docFile, txtFile)
                        if (monitor != null) monitor.worked(delta, "DOCtoTXT: "+docFile.getName())
                        if (canceled()) return false
                        if (!converted) return false
                } else { // the document is already a TXT file
                        txtIsSource = txtFile.getCanonicalFile() == docFile.getCanonicalFile()
                        // do not copy a file onto itself
                        if (!txtIsSource) org.txm.utils.FileCopy.copy(docFile, txtFile)
                }

                boolean ok = TXTtoTRS(txtFile, trsFile)
                if (monitor != null) monitor.worked(delta, "TXTtoTRS: "+docFile.getName())
                if (canceled()) return false
                if (!ok) return false

                ok = TRStoFixedTRS(trsFile, finalTrsFile)
                if (monitor != null) monitor.worked(delta, "Done: "+docFile.getName())
                if (canceled()) return false
                if (!ok) return false

                // cleaning
                System.gc()
                if (!debug) {
                        // never delete the user's source document
                        if (!txtIsSource) txtFile.delete()
                        trsFile.delete()
                }

                return true
        }
162

    
163
        /**
         * Converts a document to a TXT file with the ConvertDocument converter.
         * When a single file is processed (isDirectory is false) a converter is created for
         * this call and stopped afterwards; otherwise the converter field is used as is and
         * left running.
         *
         * @param docFile the document to convert
         * @param txtFile the TXT file to create
         * @return true if the conversion raised no exception and txtFile exists and is not empty
         */
        boolean DOCtoTXT(File docFile, File txtFile) {
                println "*** ODT -> TXT"
                boolean converted = false
                try {
                        if (!isDirectory) converter = new ConvertDocument()
                        converter.setDebug(debug)
                        converter.autoFile(docFile, txtFile, "txt")
                        converted = true
                } catch(Exception e) {
                        println "Error while converting $docFile : $e"
                        if (debug) e.printStackTrace()
                } finally {
                        if (!isDirectory && converter != null) converter.stop()
                }
                // a file left by a previous run must not turn a failed conversion into a success
                return converted && txtFile.exists() && txtFile.length() > 0
        }
177

    
178
//        boolean HTMLtoHTMLforTidy(File htmlFile, File htmlFile2) {
179
//                println "*** HTML -> HTML for tidy"
180
//                try {
181
//                        println "replace TABS with 4 spaces"
182
//                        String text2 = htmlFile.getText("UTF-8")
183
//                        text2 = text2.replaceAll("&nbsp;", " ")
184
//                        text2 = text2.replaceAll("’", "'")
185
//                        text2 = text2.replaceAll("&rsquo;", "'")
186
//                        text2 = text2.replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;")
187
//                        text2 = text2.replaceAll("\n", '&nbsp;')
188
//                        text2 = text2.replaceAll("\r\n", '&nbsp;')
189
//                        htmlFile2.withWriter("UTF-8") { writer ->
190
//                                writer.write(text2);
191
//                        }
192
//                } catch(Exception e) {
193
//                        println "Error while preparing HTML of $htmlFile : $e"
194
//                        if (debug) e.printStackTrace()
195
//                }
196
//                return htmlFile2.exists() && htmlFile2.length() > 0
197
//        }
198
//
199
//        boolean HTMLtoXHTML(File htmlFile2, File xhtmlFile) {
200
//                println "*** HTML for tidy -> XHTML"
201
//                try {
202
//                        Tidy tidy = new Tidy(); // obtain a new Tidy instance
203
//                        tidy.setXHTML(true); // set desired config options using tidy setters
204
//                        tidy.setInputEncoding("UTF-8")
205
//                        tidy.setOutputEncoding("UTF-8")
206
//                        tidy.setShowErrors(100)
207
//                        tidy.setShowWarnings(debug)
208
//                        tidy.setTabsize(10)
209
//                        tidy.setWraplen(9999)
210
//                        tidy.setForceOutput(true) // Tidy won't stop if error are found
211
//                        xhtmlFile.withWriter("UTF-8") { out ->
212
//                                def input = new InputStreamReader(htmlFile2.toURI().toURL().newInputStream(), "UTF-8")
213
//                                tidy.parse(input, out); // run tidy, providing an input and output stream
214
//                        }
215
//                        if (xhtmlFile.exists()) {
216
//                                // JTidy produced a "0x0" char. removing them
217
//                                // fix separated < and / ???
218
//                                def c = Character.toChars(0)[0]
219
//                                String txttmp = xhtmlFile.getText("UTF-8");
220
//                                xhtmlFile.withWriter("UTF-8") { out ->
221
//                                        out.write(txttmp.replace("<\n/", "</").replace("<\r\n/", "</"))
222
//                                }
223
//                        }
224
//                } catch(Exception e) {
225
//                        println "Error while applying JTidy: "+e
226
//                        if (debug) e.printStackTrace()
227
//                }
228
//                return xhtmlFile.exists() && xhtmlFile.length() > 0
229
//        }
230

    
231
        /**
         * Builds a TRS file from a TXT transcription using TXT2TRS.
         *
         * @param txtFile the TXT transcription to read
         * @param trsFile the TRS file to create
         * @return true if TXT2TRS raised no exception and trsFile exists and is not empty
         */
        boolean TXTtoTRS(File txtFile, File trsFile) {
                println "*** TXT -> TRS"
                boolean created = false
                try {
                        TXT2TRS ht = new TXT2TRS()
                        ht.process(txtFile, trsFile)
                        created = true
                } catch(Exception e) {
                        println "Error while creating TRS file: "+e
                        if (debug) e.printStackTrace()
                }
                // a file left by a previous run must not turn a failed conversion into a success
                return created && trsFile.exists() && trsFile.length() > 0
        }
242

    
243
        /**
         * Produces the final TRS file by running SetTRSTurnTimingsAndSpeakers on the
         * temporary TRS file.
         *
         * @param trsFile the temporary TRS file to read
         * @param finalTrsFile the final TRS file to create
         * @return true if the fixer reported success and finalTrsFile exists and is not empty;
         *         false if the fixer failed or raised an exception
         */
        boolean TRStoFixedTRS(File trsFile, File finalTrsFile) {
                println "*** TRS -> FIXED TRS"
                boolean fixed = false
                try {
                        def timingfixer = new SetTRSTurnTimingsAndSpeakers(trsFile, finalTrsFile)
                        if (timingfixer.process()) {
                                fixed = true
                        } else {
                                println "Error while fixing timing"
                        }
                } catch(Exception e) {
                        println "Error while fixing TRS file: "+e
                        if (debug) e.printStackTrace()
                }
                // a reported error, or a file left by a previous run, must not count as a success
                return fixed && finalTrsFile.exists() && finalTrsFile.length() > 0
        }
256
}
257