Statistics
| Revision:

root / tmp / org.txm.groovy.core / src / groovy / org / txm / macro / transcription / Transana2TRS.groovy @ 499

History | View | Annotate | Download (8.4 kB)

1
// Copyright © 2010-2013 ENS de Lyon.
2
// Copyright © 2007-2010 ENS de Lyon, CNRS, INRP, University of
3
// Lyon 2, University of Franche-Comté, University of Nice
4
// Sophia Antipolis, University of Paris 3.
5
//
6
// The TXM platform is free software: you can redistribute it
7
// and/or modify it under the terms of the GNU General Public
8
// License as published by the Free Software Foundation,
9
// either version 2 of the License, or (at your option) any
10
// later version.
11
//
12
// The TXM platform is distributed in the hope that it will be
13
// useful, but WITHOUT ANY WARRANTY; without even the implied
14
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
15
// PURPOSE. See the GNU General Public License for more
16
// details.
17
//
18
// You should have received a copy of the GNU General
19
// Public License along with the TXM platform. If not, see
20
// http://www.gnu.org/licenses.
21

    
22
package org.txm.macro.transcription
23

    
24
import org.kohsuke.args4j.*
25

    
26
import groovy.transform.Field
27

    
28
import org.txm.rcp.swt.widget.parameters.*
29
import org.txm.utils.xml.DomUtils;
30
import org.w3c.tidy.Tidy
31
import org.txm.doc.*;
32

    
33
import javax.xml.stream.*;
34

    
35
import java.util.HashMap;
36
import java.util.List;
37

    
38
import org.txm.importer.*;
39
import org.xml.sax.Attributes;
40
import org.txm.importer.filters.*;
41
import org.txm.macro.transcription.FixTransanaTimings;
42
import org.txm.macro.transcription.HTML2TRS;
43

    
44
import java.io.File;
45
import java.io.IOException;
46
import java.util.ArrayList;
47

    
48
import javax.xml.parsers.SAXParserFactory;
49
import javax.xml.parsers.ParserConfigurationException;
50
import javax.xml.parsers.SAXParser;
51

    
52
import java.net.URL;
53

    
54
import org.xml.sax.InputSource;
55
import org.xml.sax.helpers.DefaultHandler;
56

    
57
import java.text.DecimalFormat;
58
import java.text.ParseException;
59
import java.text.SimpleDateFormat;
60
import java.util.Date;
61

    
62
/**
 * Converts Transana transcription documents (ODT/DOC/RTF) to TRS files.
 *
 * Each document goes through five steps, each writing a file in {@code outDir}:
 * document -> HTML -> HTML prepared for JTidy -> XHTML -> temporary TRS -> final TRS.
 * Intermediate files are deleted after a successful run unless {@code debug} is set.
 */
class Transana2TRS {
	/** When true, stack traces are printed and intermediate files are kept. */
	boolean debug = false;
	/** Set by process(): true when the current input is a directory. */
	boolean isDirectory = false;
	/** Directory receiving the intermediate and final files. */
	File outDir;
	/** Stored by the constructor; not read anywhere in this class. */
	File dtd;
	/** Document converter; shared by all files of a directory run. */
	ConvertDocument converter;
	/** Progress monitor; must provide worked(int, String) and isCanceled(). */
	def monitor

	/**
	 * @param outDir directory where the result files are written
	 * @param dtd stored as-is (not used by this class)
	 * @param debug true to print stack traces and keep intermediate files
	 * @param monitor progress monitor used to report work and detect cancellation
	 */
	Transana2TRS(File outDir, File dtd, boolean debug, def monitor) {
		this.dtd = dtd;
		this.outDir = outDir
		this.monitor = monitor
		this.debug = debug
	}

	/**
	 * Processes a single document or all the documents of a directory.
	 *
	 * @param file a document file or a directory
	 * @return true if every processed document was converted, false otherwise
	 *         (including when an exception was raised)
	 */
	boolean process(File file) {
		try {
			isDirectory = file.isDirectory();
			if (isDirectory) {
				return processDir(file)
			} else {
				return processFile(file, 100)
			}
		} catch(Exception e) {
			println "Error while processing: "+e
			if (debug) e.printStackTrace()
			return false
		}
	}

	/**
	 * Processes every ".odt", ".doc" and ".rtf" file found directly in a directory.
	 * Stops early if the monitor is canceled.
	 *
	 * @param dir the directory to scan (not recursive)
	 * @return false if no document was found, if one document failed or if an
	 *         exception was raised; true otherwise
	 */
	boolean processDir(File dir) {
		boolean ret = true
		try {
			println "* Processing $dir directory"
			converter = new ConvertDocument();
			def files = dir.listFiles()
			def okfiles = [];
			if (files != null) {
				for (File file : files) {
					String fname = file.getName().toLowerCase()
					if ( file.isFile() &&
					(fname.endsWith(".odt") || fname.endsWith(".doc") || fname.endsWith(".rtf"))) {
						okfiles << file
					}
				}
			}

			if (okfiles.size() == 0) {
				println "No ODT/DOC/RTF file found in $dir"
				return false
			}

			// share of the 100 work units given to each file (integer division)
			int delta = 100.intdiv(okfiles.size())
			for (File file : okfiles) {
				monitor.worked(0, "Processing file: $file")
				if (!processFile(file, delta)) {
					ret = false;
					println "ERROR: Failed to process $file"
				}

				if (monitor.isCanceled()) {
					break; // stop looping
				}
			}
		} catch(Exception e) {
			println "Error while processing directory: "+e;
			if (debug) e.printStackTrace();
			ret = false // an aborted run must not be reported as a success
		} finally {
			// always release the converter, including on the early "no file found" return
			if (converter != null) converter.stop();
		}
		return ret;
	}

	/**
	 * Runs the five conversion steps on one document.
	 *
	 * @param docFile the document to convert; its name must contain an extension
	 * @param work number of monitor work units to spend on this document
	 * @return true if the final TRS file was produced; false if a step failed,
	 *         if the user canceled, or if docFile is null or has no extension
	 */
	boolean processFile(File docFile, int work) {
		if (docFile == null) return false;
		int delta = work.intdiv(5) // one share per conversion step
		println "** Processing $docFile file"
		String docName = docFile.getName()
		int idx = docName.lastIndexOf(".")
		if (idx < 0) return false;
		String filename = docName.substring(0, idx)

		File htmlFile = new File(outDir, "${filename}.html")
		File htmlFile2 = new File(outDir, "${filename}_temp.html")
		File xhtmlFile = new File(outDir, "${filename}.xhtml")
		File xmlFile = new File(outDir, "${filename}.xml")
		File trsFile = new File(outDir, "${filename}-tmp.trs")
		File finalTrsFile = new File(outDir, "${filename}.trs")

		monitor.worked(0, "DOCtoHTML: "+docName)

		// each step reports its progress, then aborts on cancel or failure
		if (!endStep(DOCtoHTML(docFile, htmlFile), delta, "HTMLtoHTMLforTidy: "+docName)) return false
		if (!endStep(HTMLtoHTMLforTidy(htmlFile, htmlFile2), delta, "HTMLtoXHTML: "+docName)) return false
		if (!endStep(HTMLtoXHTML(htmlFile2, xhtmlFile), delta, "XHTMLtoTRS: "+docName)) return false
		if (!endStep(XHTMLtoTRS(xhtmlFile, trsFile), delta, "TRStoFixedTRS: "+docName)) return false
		if (!endStep(TRStoFixedTRS(trsFile, finalTrsFile), delta, "Done: "+docName)) return false

		// cleaning
		System.gc()
		if (!debug) {
			htmlFile.delete()
			htmlFile2.delete()
			xhtmlFile.delete()
			xmlFile.delete()
			trsFile.delete()
		}

		return true
	}

	/**
	 * Reports the end of a conversion step to the monitor and tells whether the
	 * pipeline may continue.
	 *
	 * @param ok result of the step that just ran
	 * @param delta work units to report
	 * @param message monitor message announcing the next step
	 * @return false if the user canceled or if the step failed, true otherwise
	 */
	private boolean endStep(boolean ok, int delta, String message) {
		monitor.worked(delta, message)
		if (monitor.isCanceled()) {
			println "Process interrupted by user"
			return false
		}
		return ok
	}

	/**
	 * Converts a document to HTML with the document converter. In single-file
	 * mode a converter is created for the call and stopped afterwards; in
	 * directory mode the converter created by processDir() is reused.
	 *
	 * @return true if htmlFile exists and is not empty after the call
	 */
	boolean DOCtoHTML(File docFile, File htmlFile) {
		println "*** ODT -> HTML"
		try {
			if (!isDirectory) converter = new ConvertDocument();
			converter.setDebug(debug)
			converter.autoFile(docFile, htmlFile, "html")
		} catch(Exception e) {
			println "Error while converting $docFile : $e"
			if (debug) e.printStackTrace()
		} finally {
			if (!isDirectory && converter != null) converter.stop()
		}
		return htmlFile.exists() && htmlFile.length() > 0
	}

	/**
	 * Rewrites the HTML so that JTidy keeps its spacing: non-breaking spaces
	 * become plain spaces, typographic apostrophes become "'", each tab becomes
	 * four "&nbsp;" and each line break becomes one "&nbsp;".
	 *
	 * @return true if htmlFile2 exists and is not empty after the call
	 */
	boolean HTMLtoHTMLforTidy(File htmlFile, File htmlFile2) {
		println "*** HTML -> HTML for tidy"
		try {
			println "replace TABS with 4 spaces"
			String text2 = htmlFile.getText("UTF-8")
			text2 = text2.replace("&nbsp;", " ")
			// NOTE(review): the pattern of this replacement was an empty string, which
			// inserts an apostrophe between every character. It presumably held a
			// typographic apostrophe (Windows-1252 0x92 or U+2019) - confirm.
			text2 = text2.replace("\u0092", "'").replace("\u2019", "'")
			text2 = text2.replace("&rsquo;", "'")
			text2 = text2.replace("\t", "&nbsp;&nbsp;&nbsp;&nbsp;")
			// CRLF must be matched together with LF, otherwise a stray "\r" is left behind
			text2 = text2.replaceAll("\r?\n", "&nbsp;")
			htmlFile2.withWriter("UTF-8") { writer ->
				writer.write(text2);
			}
		} catch(Exception e) {
			println "Error while preparing HTML of $htmlFile : $e"
			if (debug) e.printStackTrace()
		}
		return htmlFile2.exists() && htmlFile2.length() > 0
	}

	/**
	 * Converts the prepared HTML to XHTML with JTidy, then cleans the result.
	 *
	 * @return true if xhtmlFile exists and is not empty after the call
	 */
	boolean HTMLtoXHTML(File htmlFile2, File xhtmlFile) {
		println "*** HTML for tidy -> XHTML"
		try {
			Tidy tidy = new Tidy(); // obtain a new Tidy instance
			tidy.setXHTML(true); // set desired config options using tidy setters
			tidy.setInputEncoding("UTF-8")
			tidy.setOutputEncoding("UTF-8")
			tidy.setShowErrors(100)
			tidy.setShowWarnings(debug)
			tidy.setTabsize(10)
			tidy.setWraplen(9999)
			tidy.setForceOutput(true) // Tidy won't stop if error are found
			xhtmlFile.withWriter("UTF-8") { out ->
				// withReader closes the input once Tidy is done
				htmlFile2.withReader("UTF-8") { input ->
					tidy.parse(input, out); // run tidy, providing an input and output stream
				}
			}
			if (xhtmlFile.exists()) {
				// JTidy may produce "0x0" chars: remove them,
				// and re-join a "<" separated from its "/" by a line break
				String txttmp = xhtmlFile.getText("UTF-8");
				String cleaned = txttmp.replace("\u0000", "").replace("<\n/", "</").replace("<\r\n/", "</")
				xhtmlFile.withWriter("UTF-8") { out ->
					out.write(cleaned)
				}
			}
		} catch(Exception e) {
			println "Error while applying JTidy: "+e
			if (debug) e.printStackTrace()
		}
		return xhtmlFile.exists() && xhtmlFile.length() > 0
	}

	/**
	 * Builds the temporary TRS file from the XHTML file with HTML2TRS.
	 *
	 * @return true if trsFile exists and is not empty after the call
	 */
	boolean XHTMLtoTRS(File xhtmlFile, File trsFile) {
		println "*** XHTML -> TRS"
		try {
			HTML2TRS ht = new HTML2TRS();
			ht.process(xhtmlFile, trsFile);
		} catch(Exception e) {
			println "Error while creating TRS file: "+e
			if (debug) e.printStackTrace()
		}
		return trsFile.exists() && trsFile.length() > 0
	}

	/**
	 * Builds the final TRS file from the temporary one with FixTransanaTimings.
	 * A failure reported by the fixer is only printed: the result depends on
	 * the presence of the final file.
	 *
	 * @return true if finalTrsFile exists and is not empty after the call
	 */
	boolean TRStoFixedTRS(File trsFile, File finalTrsFile) {
		println "*** TRS -> FIXED TRS"
		try {
			def timingfixer = new FixTransanaTimings(trsFile, finalTrsFile);
			if (!timingfixer.process()) {
				println "Error while fixing timing"
			}
		} catch(Exception e) {
			println "Error while fixing TRS file: "+e
			if (debug) e.printStackTrace()
		}
		return finalTrsFile.exists() && finalTrsFile.length() > 0
	}
}
296