Statistics
| Revision:

root / tmp / org.txm.core / src / groovy / org / txm / macro / transcription / Transana2TRS.groovy @ 187

History | View | Annotate | Download (7.7 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
package org.txm.macro.transcription
23

    
24
import javax.xml.stream.*
25

    
26
import org.kohsuke.args4j.*
27
import org.txm.doc.*
28
import org.txm.importer.*
29
import org.txm.importer.filters.*
30
import org.txm.rcpapplication.swt.widget.parameters.*
31
import org.w3c.tidy.Tidy
32

    
33
class Transana2TRS {
34
        /** When true, stack traces are printed and intermediate files are kept. */
        boolean debug = false;
        /** Set by process(): true when the input being processed is a directory. */
        boolean isDirectory = false;
        /** Directory receiving the intermediate and final files. */
        File outDir;
        /** DTD file handed to the constructor; only stored by the constructor below. */
        File dtd;
        /** Document converter used for the first conversion step (document to HTML). */
        ConvertDocument converter;
        /** Progress monitor; this class calls worked(int, String) and isCanceled() on it. */
        def monitor

        /**
         * Stores the conversion settings; no work is done at construction time.
         *
         * @param outDir directory where result files are written
         * @param dtd DTD file (only stored)
         * @param debug enables stack traces and keeps intermediate files
         * @param monitor progress monitor notified during processing
         */
        Transana2TRS(File outDir, File dtd, boolean debug, def monitor) {
                this.outDir = outDir
                this.dtd = dtd
                this.debug = debug
                this.monitor = monitor
        }
47

    
48
        /**
         * Entry point: processes a directory with processDir() or a single
         * file with processFile(), depending on what 'file' is.
         *
         * @param file the document or the directory to convert
         * @return the result of the delegated call, or false when an exception was caught
         */
        boolean process(File file) {
                try {
                        isDirectory = file.isDirectory()
                        // A single file gets the whole progress budget (100).
                        return isDirectory ? processDir(file) : processFile(file, 100)
                } catch(Exception e) {
                        println "Error while processing: "+e
                        if (debug) e.printStackTrace()
                }
                // Only reached after a caught exception.
                return false
        }
61

    
62
        /**
         * Converts every ODT/DOC/RTF file found directly in 'dir' (no recursion).
         * One converter is created for the whole directory and is always stopped
         * before returning, including on the "no file found" and exception paths.
         *
         * @param dir the directory to scan
         * @return true only if at least one file was found, every processed file
         *         succeeded and no exception was raised
         */
        boolean processDir(File dir) {
                boolean ret = true
                try {
                        println "* Processing $dir directory"
                        converter = new ConvertDocument()

                        // Keep regular files whose extension is supported (case-insensitive).
                        def okfiles = []
                        def files = dir.listFiles()
                        if (files != null) {
                                for (File file : files) {
                                        String fname = file.getName().toLowerCase()
                                        if (file.isFile() &&
                                        (fname.endsWith(".odt") || fname.endsWith(".doc") || fname.endsWith(".rtf"))) {
                                                okfiles << file
                                        }
                                }
                        }

                        if (okfiles.size() == 0) {
                                println "No ODT/DOC/RTF file found in $dir"
                                return false
                        }

                        // Share the 100 progress units between the files.
                        int delta = 100 / okfiles.size()
                        for (File file : okfiles) {
                                monitor.worked(0, "Processing file: $file")
                                if (!processFile(file, delta)) {
                                        ret = false
                                        println "ERROR: Failed to process $file"
                                }

                                if (monitor.isCanceled()) {
                                        break // stop looping
                                }
                        }
                } catch(Exception e) {
                        // An exception means the directory was not fully processed.
                        ret = false
                        println "Error while processing directory: "+e
                        if (debug) e.printStackTrace()
                } finally {
                        // Runs on every exit path so the converter is never left running.
                        if (converter != null) converter.stop()
                }
                return ret
        }
103

    
104
        /**
         * Runs the five conversion steps on one document:
         * DOCtoHTML, HTMLtoHTMLforTidy, HTMLtoXHTML, XHTMLtoTRS, TRStoFixedTRS.
         * The chain stops at the first failing step or as soon as the monitor
         * reports a cancellation. Unless 'debug' is set, the intermediate files
         * are deleted on every exit path; the final ".trs" file is never deleted.
         *
         * @param docFile the document to convert; null or a name without '.' gives false
         * @param work progress units allotted to this file; one fifth is reported after each step
         * @return true if all five steps succeeded and the user did not cancel
         */
        boolean processFile(File docFile, int work) {
                if (docFile == null) return false
                int delta = work / 5
                println "** Processing $docFile file"
                String docName = docFile.getName()
                // Cut at the LAST dot so that "a.b.odt" yields "a.b" and does not
                // collide with "a.c.odt" on the same output names.
                int idx = docName.lastIndexOf(".")
                if (idx < 0) return false
                String filename = docName.substring(0, idx)

                File htmlFile = new File(outDir, "${filename}.html")
                File htmlFile2 = new File(outDir, "${filename}_temp.html")
                File xhtmlFile = new File(outDir, "${filename}.xhtml")
                File xmlFile = new File(outDir, "${filename}.xml")
                File trsFile = new File(outDir, "${filename}-tmp.trs")
                File finalTrsFile = new File(outDir, "${filename}.trs")

                // Common post-step check: cancellation is tested (and reported) first,
                // then the result of the step that just ran.
                def mustStop = { boolean stepOk ->
                        if (monitor.isCanceled()) {
                                println "Process interrupted by user"
                                return true
                        }
                        return !stepOk
                }

                try {
                        monitor.worked(0, "DOCtoHTML: "+docName)

                        boolean ret = DOCtoHTML(docFile, htmlFile)
                        monitor.worked(delta, "HTMLtoHTMLforTidy: "+docName)
                        if (mustStop(ret)) return false

                        ret = HTMLtoHTMLforTidy(htmlFile, htmlFile2)
                        monitor.worked(delta, "HTMLtoXHTML: "+docName)
                        if (mustStop(ret)) return false

                        ret = HTMLtoXHTML(htmlFile2, xhtmlFile)
                        monitor.worked(delta, "XHTMLtoTRS: "+docName)
                        if (mustStop(ret)) return false

                        ret = XHTMLtoTRS(xhtmlFile, trsFile)
                        monitor.worked(delta, "TRStoFixedTRS: "+docName)
                        if (mustStop(ret)) return false

                        ret = TRStoFixedTRS(trsFile, finalTrsFile)
                        monitor.worked(delta, "Done: "+docName)
                        if (mustStop(ret)) return false

                        return true
                } finally {
                        // cleaning
                        // NOTE(review): System.gc() is kept from the original; presumably it
                        // helps release file handles before the deletes - confirm.
                        System.gc()
                        if (!debug) {
                                for (File tmp : [htmlFile, htmlFile2, xhtmlFile, xmlFile, trsFile]) {
                                        tmp.delete()
                                }
                        }
                }
        }
172

    
173
        /**
         * Converts the source document to HTML with the ConvertDocument converter.
         * In single-file mode (isDirectory is false) a converter is created for
         * this call and stopped afterwards; in directory mode the existing
         * 'converter' field is used and left running.
         *
         * @param docFile the document to convert
         * @param htmlFile the HTML file to produce
         * @return true if no exception was raised and htmlFile exists and is not empty
         */
        boolean DOCtoHTML(File docFile, File htmlFile) {
                println "*** ODT -> HTML"
                try {
                        if (!isDirectory) converter = new ConvertDocument()
                        converter.setDebug(debug)
                        converter.autoFile(docFile, htmlFile, "html")
                } catch(Exception e) {
                        println "Error while converting $docFile : $e"
                        if (debug) e.printStackTrace()
                        // Do not let an output file left by a previous run pass for a success.
                        return false
                } finally {
                        if (!isDirectory && converter != null) converter.stop()
                }
                return htmlFile.exists() && htmlFile.length() > 0
        }
187

    
188
        /**
         * Rewrites the HTML text before it is given to Tidy: non-breaking space
         * entities become plain spaces, right single quotes become "'", each tab
         * becomes four non-breaking space entities and each line break (CRLF or
         * LF) becomes one non-breaking space entity. Files are read and written
         * as UTF-8.
         *
         * @param htmlFile the HTML file to read
         * @param htmlFile2 the prepared HTML file to write
         * @return true if no exception was raised and htmlFile2 exists and is not empty
         */
        boolean HTMLtoHTMLforTidy(File htmlFile, File htmlFile2) {
                println "*** HTML -> HTML for tidy"
                try {
                        println "replace TABS with 4 spaces"
                        String text2 = htmlFile.getText("UTF-8")
                        // Literal replacements: String.replace avoids regex interpretation.
                        text2 = text2.replace("&nbsp;", " ")
                        // NOTE(review): the searched character was not visible in the reviewed
                        // copy of this file; assumed to be U+0092 (the Windows-1252 right
                        // single quote read as a C1 control) - confirm against the repository.
                        text2 = text2.replace("\u0092", "'")
                        text2 = text2.replace("&rsquo;", "'")
                        text2 = text2.replace("\t", "&nbsp;&nbsp;&nbsp;&nbsp;")
                        // CRLF and LF are handled in one pass so that no stray '\r' is left behind.
                        text2 = text2.replaceAll("\r?\n", "&nbsp;")
                        htmlFile2.withWriter("UTF-8") { writer ->
                                writer.write(text2)
                        }
                } catch(Exception e) {
                        println "Error while preparing HTML of $htmlFile : $e"
                        if (debug) e.printStackTrace()
                        // Do not let an output file left by a previous run pass for a success.
                        return false
                }
                return htmlFile2.exists() && htmlFile2.length() > 0
        }
208

    
209
        /**
         * Runs JTidy on the prepared HTML to produce an XHTML file, then cleans
         * the Tidy output: 0x0 characters are removed and a '<' separated from
         * its '/' by a line break is glued back into "</". Files are read and
         * written as UTF-8.
         *
         * @param htmlFile2 the prepared HTML file to read
         * @param xhtmlFile the XHTML file to produce
         * @return true if no exception was raised and xhtmlFile exists and is not empty
         */
        boolean HTMLtoXHTML(File htmlFile2, File xhtmlFile) {
                println "*** HTML for tidy -> XHTML"
                try {
                        Tidy tidy = new Tidy() // obtain a new Tidy instance
                        tidy.setXHTML(true) // set desired config options using tidy setters
                        tidy.setInputEncoding("UTF-8")
                        tidy.setOutputEncoding("UTF-8")
                        tidy.setShowErrors(100)
                        tidy.setShowWarnings(debug)
                        tidy.setTabsize(10)
                        tidy.setWraplen(9999)
                        tidy.setForceOutput(true) // Tidy won't stop if errors are found
                        xhtmlFile.withWriter("UTF-8") { out ->
                                // withReader closes the input once Tidy is done with it.
                                htmlFile2.withReader("UTF-8") { input ->
                                        tidy.parse(input, out) // run tidy, providing an input and output stream
                                }
                        }
                        if (xhtmlFile.exists()) {
                                // JTidy can emit 0x0 characters: remove them,
                                // then fix '<' and '/' separated by a line break.
                                String nul = String.valueOf(Character.toChars(0))
                                String txttmp = xhtmlFile.getText("UTF-8")
                                txttmp = txttmp.replace(nul, "").replace("<\n/", "</").replace("<\r\n/", "</")
                                xhtmlFile.withWriter("UTF-8") { out ->
                                        out.write(txttmp)
                                }
                        }
                } catch(Exception e) {
                        println "Error while applying JTidy: "+e
                        if (debug) e.printStackTrace()
                        // Do not let an output file left by a previous run pass for a success.
                        return false
                }
                return xhtmlFile.exists() && xhtmlFile.length() > 0
        }
240

    
241
        /**
         * Converts the XHTML file to a TRS file with HTML2TRS.
         *
         * @param xhtmlFile the XHTML file to read
         * @param trsFile the TRS file to produce
         * @return true if no exception was raised and trsFile exists and is not empty
         */
        boolean XHTMLtoTRS(File xhtmlFile, File trsFile) {
                println "*** XHTML -> TRS"
                try {
                        HTML2TRS ht = new HTML2TRS()
                        ht.process(xhtmlFile, trsFile)
                } catch(Exception e) {
                        println "Error while creating TRS file: "+e
                        if (debug) e.printStackTrace()
                        // Do not let an output file left by a previous run pass for a success.
                        return false
                }
                return trsFile.exists() && trsFile.length() > 0
        }
252

    
253
        /**
         * Produces the final TRS file from the temporary one with FixTransanaTimings.
         *
         * @param trsFile the temporary TRS file to read
         * @param finalTrsFile the final TRS file to produce
         * @return true if the timing fixer reported success, no exception was
         *         raised and finalTrsFile exists and is not empty
         */
        boolean TRStoFixedTRS(File trsFile, File finalTrsFile) {
                println "*** TRS -> FIXED TRS"
                try {
                        def timingfixer = new FixTransanaTimings(trsFile, finalTrsFile)
                        if (!timingfixer.process()) {
                                println "Error while fixing timing"
                                // A reported failure must not be turned into a success
                                // just because an output file is present.
                                return false
                        }
                } catch(Exception e) {
                        println "Error while fixing TRS file: "+e
                        if (debug) e.printStackTrace()
                        return false
                }
                return finalTrsFile.exists() && finalTrsFile.length() > 0
        }
266
}
267