Statistics
| Revision:

ccc / projets / CMC2ELAN / src / TEICMC2TRS.groovy @ 2

History | View | Annotate | Download (5 kB)

1

    
2

    
3
import java.io.File;
4
import java.net.URL;
5

    
6
import javax.xml.stream.*;
7
import java.text.SimpleDateFormat;
8
import java.util.Date;
9

    
10
/**
 * Streams a TEI-CMC document and emits a Transcriber-style (TRS) document.
 *
 * When the {@code body} element starts, a {@code Trans} header (empty
 * {@code Topics}, one {@code Speaker}, an {@code Episode} and an open
 * {@code Section}) is written. For every {@code posting} element the text of
 * its {@code name}, {@code date}, {@code time} and {@code p} children is
 * collected, and when the posting ends one {@code Turn} is written.
 *
 * The {@code parser} and {@code writer} objects used below are not declared in
 * this class; presumably they are provided by {@code StaxIdentityParser}.
 */
public class TEICMC2TRS extends StaxIdentityParser {

        /** Not read or written by the code of this class. */
        File outputDirectory;
        /** Written as both the id and the name of the single Speaker of the TRS header. */
        String loc;

        /**
         * @param inputFile the file handed to the parent parser
         * @param loc value used as id and name of the Speaker declared in the header
         */
        public TEICMC2TRS(File inputFile, String loc) {
                super(inputFile);
                this.loc = loc;
        }

        /** Set when the body element starts; never read or reset in this class. */
        boolean inBody = false
        /** True while inside a name element of a posting. */
        boolean inName = false;
        /** Text collected from the name element; the start timestamp is extracted from it. */
        String filenameValue = "";
        /** True while inside a date element of a posting. */
        boolean inDate = false;
        /** Text collected from the date element; only used in the error message. */
        String dateValue = "";
        /** True while inside a time element of a posting. */
        boolean inTime = false;
        /** Text collected from the time element of the current posting. */
        String timeValue = "";
        /** True from the start of a posting element until its Turn has been written. */
        boolean inPosting = false;
        /**
         * Value of the "type" attribute of the current posting; never read in this class.
         * NOTE(review): the name suggests an identifier but the value comes from "type" - confirm.
         */
        String postingID = ""
        /** Value of the "who" attribute of the current posting, written as the Turn speaker. */
        String postingWho = ""
        /** True while inside a p element of a posting. */
        boolean inP = false;
        /** Text collected from the p element, written as the Turn text. */
        String content = ""

        /**
         * Handles a start tag: writes the TRS header on {@code body}, reads the
         * attributes of a {@code posting}, and inside a posting raises the flag
         * (and clears the buffer) matching the child element that starts.
         */
        protected void processStartElement() {
                String localName = parser.getLocalName()
                if (localName == "body") { // write TRS HEADER
                        inBody = true
                        writeTrsHeader()
                } else if (localName == "posting") {
                        inPosting = true
                        for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                                String attributeName = parser.getAttributeLocalName(i)
                                if (attributeName == "who") {
                                        postingWho = parser.getAttributeValue(i)
                                } else if (attributeName == "type") {
                                        postingID = parser.getAttributeValue(i)
                                }
                        }
                } else if (inPosting) {
                        // "dateline" and any other child need no handling on start
                        if (localName == "name") {
                                inName = true
                                filenameValue = ""
                        } else if (localName == "date") {
                                inDate = true;
                                dateValue = ""
                        } else if (localName == "time") {
                                inTime = true;
                                timeValue = ""
                        } else if (localName == "p") {
                                inP = true;
                                content = ""
                        }
                }
        }

        /**
         * Appends the current text to the buffer of the posting child being read.
         * Text found anywhere else is ignored.
         */
        protected void processCharacters() {
                if (!inPosting) {
                        return;
                }
                if (inName) {
                        filenameValue += parser.getText();
                } else if (inDate) {
                        dateValue += parser.getText();
                } else if (inTime) {
                        timeValue += parser.getText();
                } else if (inP) {
                        content += parser.getText();
                }
        }

        /** Parses the six-digit HHmmss timestamps of the name and time values. */
        SimpleDateFormat formatter = new SimpleDateFormat("HHmmss");
        /** Reference time of the recording; only the first one found is used. */
        Date from = null

        /**
         * Handles an end tag: closes the TRS structure on {@code body}, lowers
         * the flag of the posting child that ends, and writes the Turn when the
         * {@code posting} itself ends.
         */
        protected void processEndElement()
        {
                String localName = parser.getLocalName()
                if (localName == "body") { // close what the header left open
                        writer.writeEndElement(); //Section
                        writer.writeEndElement(); //Episode
                        writer.writeEndElement(); //Trans
                } else if (inPosting) {
                        if (localName == "name") {
                                inName = false;
                        } else if (localName == "date") {
                                inDate = false;
                        } else if (localName == "p") {
                                inP = false;
                        } else if (localName == "time") {
                                inTime = false;
                        } else if (localName == "posting") {
                                if (writer == null) {
                                        // the '+' was missing: the location was never appended to the message
                                        println "Error Houston !!! with loc=$postingWho date=$dateValue at=" + parser.getLocation()
                                        return;
                                }
                                writeTurn()
                                inPosting = false
                        }
                }
        }

        /** Writes the Trans header: empty Topics, one Speaker, and the opened Episode and Section. */
        private void writeTrsHeader() {
                writer.writeStartElement("Trans");
                writer.writeAttribute("version","10");
                writer.writeAttribute("audio_filename","");
                writer.writeAttribute("scribe","");
                writer.writeAttribute("version_date","");

                writer.writeStartElement("Topics");
                writer.writeEndElement(); //Topics
                writer.writeStartElement("Speakers");
                writer.writeStartElement("Speaker");
                writer.writeAttribute("id",loc);
                writer.writeAttribute("name",loc);
                writer.writeAttribute("check","");
                writer.writeAttribute("dialect","");
                writer.writeAttribute("scope","");
                writer.writeEndElement(); //Speaker
                writer.writeEndElement(); //Speakers

                // Episode and Section stay open; they are closed when body ends
                writer.writeStartElement("Episode");
                writer.writeStartElement("Section");
                writer.writeAttribute("type","report");
                writer.writeAttribute("startTime","");
                writer.writeAttribute("endTime","");
        }

        /**
         * Writes the Turn of the posting that just ended. Its start time is the
         * number of seconds between the reference time and the posting time; its
         * end time is ten seconds later.
         *
         * @throws StringIndexOutOfBoundsException if the reference time is not yet
         *         known and the name value has no ".flv" preceded by six characters
         */
        private void writeTurn() {
                if (from == null) {
                        // the reference time is the six characters just before ".flv"
                        int extensionIndex = filenameValue.indexOf(".flv")
                        int stampIndex = extensionIndex - 6
                        if (stampIndex < 0) {
                                throw new StringIndexOutOfBoundsException("Cannot find a HHmmss timestamp before '.flv' in name '" + filenameValue + "' at " + parser.getLocation())
                        }
                        from = formatter.parse(filenameValue.substring(stampIndex, extensionIndex));
                }

                // trim so that whitespace around the time text does not break the parsing
                Date to = formatter.parse(timeValue.replace(":", "").trim());
                float diff = ((float)((to.getTime() - from.getTime()))) / 1000.0f // seconds
                float diff2 = diff + 10
                writer.writeStartElement("Turn");
                writer.writeAttribute("startTime","$diff");
                writer.writeAttribute("endTime","$diff2");
                writer.writeAttribute("speaker",postingWho);
                writer.writeStartElement("Sync");
                writer.writeAttribute("time","$diff");
                writer.writeEndElement(); //Sync
                writer.writeCharacters(content)
                writer.writeEndElement(); //Turn
                writer.writeCharacters("\n")
        }

        /**
         * Development entry point working on hard-coded files.
         *
         * NOTE(review): both converters are only constructed here, nothing visible
         * in this file runs them, and the test below is true for any non-null
         * object, so the only visible effect is the deletion of outputFile.
         * Presumably each converter has to be run first - confirm against
         * StaxIdentityParser and FixMissingTimings.
         */
        public static void main(String[] args) {
                File inputFile = new File("/home/mdecorde/xml/comere/split_out/tt_5-2013-11-12-teicmr.xml")
                File outputFile = new File("/home/mdecorde/xml/comere/split_out/tt_5-2013-11-12-teicmr-broken.trs")
                File outputFixedFile = new File("/home/mdecorde/xml/comere/split_out/tt_5-2013-11-12-teicmr.trs")

                XPathResult xpathProcessor = new XPathResult(inputFile);
                def loc = xpathProcessor.getXpathResponse("//tei:posting/@who")

                def p1 = new TEICMC2TRS(inputFile, loc)
                def p2 = new FixMissingTimings(outputFile, outputFixedFile)

                if (p1 && p2) {
                        outputFile.delete()
                } else {
                        println "Failed to convert $inputFile to TRS"
                }
        }
}