Revision 479 tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/TXT2TRS.groovy

TXT2TRS.groovy (revision 479)
18 18
// You should have received a copy of the GNU General
19 19
// Public License along with the TXM platform. If not, see
20 20
// http://www.gnu.org/licenses.
21

  
22 21
package org.txm.macro.transcription
23 22

  
24
import java.text.DecimalFormat
25
import java.text.SimpleDateFormat
23
import org.txm.utils.xml.DomUtils;
24
import org.w3c.tidy.Tidy
25
import org.txm.doc.*
26 26

  
27 27
import javax.xml.stream.*
28 28

  
29
import org.txm.doc.*
29
import java.util.HashMap
30
import java.util.List
31

  
30 32
import org.txm.importer.*
33
import org.xml.sax.Attributes
31 34
import org.txm.importer.filters.*
35
import org.txm.utils.io.IOUtils;
36
import org.txm.utils.i18n.DetectBOM
32 37

  
38
import java.io.File
39
import java.io.IOException
40
import java.util.ArrayList
41

  
42
import javax.xml.parsers.SAXParserFactory
43
import javax.xml.parsers.ParserConfigurationException
44
import javax.xml.parsers.SAXParser
45

  
46
import java.net.URL
47

  
48
import org.xml.sax.InputSource
49
import org.xml.sax.helpers.DefaultHandler
50

  
51
import java.text.DecimalFormat
52
import java.text.ParseException
53
import java.text.SimpleDateFormat
54
import java.util.Date;
55

  
56
/**
57
 * Parse a formatted TXT file to create a TRS file.
58
 * 
59
 * Manages turns, syncs, locutors and sections.
60
 * 
61
 * The parsing of the TXT file starts at the first time code.
62
 *  
63
 * @author mdecorde, sheiden
64
 *
65
 */
33 66
class TXT2TRS {
34 67

  
35 68
	boolean inTurn = false
......
49 82
	boolean isThemeOpened = false
50 83

  
51 84
	def formater
85
	def formater_without_ms
52 86
	def formater2
53 87
	StaxStackWriter pagedWriter
54 88
	
55 89
	
56 90
	//                 /\([0-9]+:[0-9][0-9]?:[0-9][0-9]?(\.[0-9])?\)|(¤<[0-9]+>)/
57
	def bullet_regex = /\([0-9]+:[0-9][0-9]?:[0-9][0-9]?(\.[0-9])?\)|(¤<[0-9]+>)/
91
	def bullet_regex = /\([0-9]+:[0-9][0-9]?:[0-9][0-9]?(\.[0-9][0-9]?)?\)|(¤<[0-9]+>)/
58 92
	def turn_with_bullet = /.+(\([0-9]+:[0-9][0-9]?:[0-9][0-9]?(\.[0-9])?\)|(¤<[0-9]+>))/
59 93
	//                  /([a-zA-Z]+)="([^"]+)"/
60 94
	def section_regex = /([a-zA-Z]+)="([^"]+)"/
......
62 96
	public TXT2TRS() { }
63 97

  
64 98
	public boolean process(File txtFile, File trsFile) {
99
//
100
		FileInputStream input = new FileInputStream(txtFile);
101
		Reader reader = new InputStreamReader(input , "UTF-8");
102
		
103
		for (int i = 0 ; i < new DetectBOM(txtFile).getBOMSize() ; i++) input.read();
104
		def lines = reader.readLines();
65 105

  
66
		def lines = txtFile.readLines("UTF-8")
67

  
68 106
		formater = new SimpleDateFormat("h:mm:ss.S");
107
		formater_without_ms = new SimpleDateFormat("h:mm:ss");
69 108
		formater.setTimeZone(TimeZone.getTimeZone("GMT"));
109
		formater_without_ms.setTimeZone(TimeZone.getTimeZone("GMT"));
70 110
		formater2 = new DecimalFormat("#######.0")
71 111
		formater2.setMaximumFractionDigits(2)
72 112

  
......
93 133
		boolean started = false
94 134
		for (String line : lines) {
95 135
			line = line.trim()
96
			if (line.matches("\\(.+:..:..\\..+\\)")) {
136
			if (line.matches(bullet_regex)) {
97 137
				started = true;
98 138
			}
99 139

  
......
107 147
			return false;
108 148
		}
109 149
		String lastLine = trslines[-1]
110
		if (!lastLine.matches("\\(.+:..:..\\..+\\)")) {
150
		if (!lastLine.matches(bullet_regex)) {
111 151
			// check if last turn has a bullet at the end
112 152
			boolean bulletIsMissing = true;
113 153
			for (int i = trslines.size() - 1 ; i > 0 ; i--) {
......
337 377
			pagedWriter.writeAttribute("time", time.substring(0,time.length()-3)+"."+time.substring(time.length()-3));
338 378
		} else {
339 379
			String str = txt.substring(1, txt.length()-1)
340
			Date date = formater.parse(str);
380
			Date date = null;
381
			try {date = formater.parse(str);} catch(Exception e1){}
382
			try {date = formater_without_ms.parse(str);} catch(Exception e2){println "Failed to parse time: "+str; return;}
341 383
			pagedWriter.writeAttribute("time", ""+(date.getTime()/1000.0f));
342 384
		}
343 385
		pagedWriter.writeCharacters("\n")

Also available in: Unified diff