Statistics
| Revision:

ccc / projets / CMC2ELAN / src / TEICMC2TRS.groovy @ 2

History | View | Annotate | Download (5 kB)

1 2 mdecorde
2 2 mdecorde
3 2 mdecorde
import java.io.File;
4 2 mdecorde
import java.net.URL;
5 2 mdecorde
6 2 mdecorde
import javax.xml.stream.*;
7 2 mdecorde
import java.text.SimpleDateFormat;
8 2 mdecorde
import java.util.Date;
9 2 mdecorde
10 2 mdecorde
/**
 * StAX converter that reads a TEI-CMC document and writes a Transcriber-like
 * (TRS) XML structure.
 *
 * When the TEI {@code body} element starts, a {@code Trans} header with a
 * single {@code Speaker} is written. Each {@code posting} element is then
 * turned into one {@code Turn} whose start time is the offset, in seconds,
 * between the posting's {@code time} value and a reference time read from the
 * first posting's {@code name} value (the six {@code HHmmss} characters that
 * precede ".flv"). The turn's end time is its start time plus 10.
 *
 * The {@code parser} and {@code writer} objects used below are not declared in
 * this class; presumably they are provided by {@code StaxIdentityParser}.
 */
public class TEICMC2TRS extends StaxIdentityParser {

    File outputDirectory;

    /** Value written as both the "id" and the "name" of the single Speaker element. */
    String loc;

    // --- parsing state -----------------------------------------------------
    boolean inBody = false
    boolean inName = false;
    String filenameValue = "";
    boolean inDate = false;
    String dateValue = "";
    boolean inTime = false;
    String timeValue = "";
    boolean inPosting = false;
    // NOTE(review): postingID is filled from the "type" attribute and is never
    // read in this class - confirm whether another attribute was intended.
    String postingID = ""
    String postingWho = ""
    boolean inP;
    String content = ""

    /** Parses the "HHmmss" clock values found in file names and (colon-stripped) time elements. */
    SimpleDateFormat formatter = new SimpleDateFormat("HHmmss");

    /** Reference time of the transcription; set once, from the first posting that provides it. */
    Date from = null

    /**
     * @param inputFile the TEI-CMC file handed to the superclass constructor
     * @param loc       the speaker id/name written in the TRS header
     */
    public TEICMC2TRS(File inputFile, String loc) {
        super(inputFile);
        this.loc = loc;
    }

    /**
     * Handles a start tag: writes the TRS header on {@code body}, records the
     * attributes of a {@code posting}, and, inside a posting, starts capturing
     * the text of {@code name}, {@code date}, {@code time} and {@code p}.
     * Any other element is ignored.
     */
    protected void processStartElement() {
        String localName = parser.getLocalName()

        if (localName == "body") {
            inBody = true
            writeTrsHeader()
        } else if (localName == "posting") {
            inPosting = true
            for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
                String attributeName = parser.getAttributeLocalName(i)
                if (attributeName == "who") {
                    postingWho = parser.getAttributeValue(i)
                } else if (attributeName == "type") {
                    postingID = parser.getAttributeValue(i)
                }
            }
        } else if (inPosting) {
            // Each captured value is reset when its element opens, so a posting
            // never inherits text from the previous one.
            switch (localName) {
                case "name":
                    inName = true
                    filenameValue = ""
                    break
                case "date":
                    inDate = true
                    dateValue = ""
                    break
                case "time":
                    inTime = true
                    timeValue = ""
                    break
                case "p":
                    inP = true
                    content = ""
                    break
                // "dateline" and every other element: nothing to do
            }
        }
    }

    /**
     * Appends the current text to whichever posting value is being captured.
     * Text outside a posting, or outside the captured elements, is dropped.
     */
    protected void processCharacters() {
        if (!inPosting) {
            return
        }

        if (inName) {
            filenameValue += parser.getText()
        } else if (inDate) {
            dateValue += parser.getText()
        } else if (inTime) {
            timeValue += parser.getText()
        } else if (inP) {
            content += parser.getText()
        }
    }

    /**
     * Handles an end tag: closes the TRS structure on {@code body}, stops the
     * text captures, and writes one {@code Turn} when a {@code posting} closes.
     */
    protected void processEndElement() {
        String localName = parser.getLocalName()

        if (localName == "body") { // write TRS footer
            writer.writeEndElement(); //Section
            writer.writeEndElement(); //Episode
            writer.writeEndElement(); //Trans
        } else if (inPosting) {
            switch (localName) {
                case "name":
                    inName = false
                    break
                case "date":
                    inDate = false
                    break
                case "p":
                    inP = false
                    break
                case "time":
                    inTime = false
                    break
                case "posting":
                    // The posting is over whatever happens while writing it,
                    // so the flag is cleared on every exit path.
                    inPosting = false
                    writePostingAsTurn()
                    break
            }
        }
    }

    /** Writes the Trans/Topics/Speakers header and opens the Episode and Section elements. */
    private void writeTrsHeader() {
        writer.writeStartElement("Trans");
        writer.writeAttribute("version", "10");
        writer.writeAttribute("audio_filename", "");
        writer.writeAttribute("scribe", "");
        writer.writeAttribute("version_date", "");

        writer.writeStartElement("Topics");
        writer.writeEndElement(); //Topics

        writer.writeStartElement("Speakers");
        writer.writeStartElement("Speaker");
        writer.writeAttribute("id", loc);
        writer.writeAttribute("name", loc);
        writer.writeAttribute("check", "");
        writer.writeAttribute("dialect", "");
        writer.writeAttribute("scope", "");
        writer.writeEndElement(); //Speaker
        writer.writeEndElement(); //Speakers

        // Episode and Section stay open; they are closed when "body" ends.
        writer.writeStartElement("Episode");
        writer.writeStartElement("Section");
        writer.writeAttribute("type", "report");
        writer.writeAttribute("startTime", "");
        writer.writeAttribute("endTime", "");
    }

    /**
     * Writes the posting that just ended as a Turn containing one Sync and the
     * captured paragraph text. The posting is reported and skipped when there
     * is no writer, or when the reference time cannot be read from the name.
     */
    private void writePostingAsTurn() {
        if (writer == null) {
            println "Error Houston !!! with loc=$postingWho date=$dateValue at=${parser.getLocation()}"
            return
        }

        if (from == null) { // use only the first from
            int extensionIndex = filenameValue.indexOf(".flv")
            if (extensionIndex < 6) {
                // ".flv" is missing, or fewer than six characters precede it:
                // there is no HHmmss stamp to read the reference time from.
                println "Error: no HHmmss stamp before '.flv' in name='$filenameValue' with loc=$postingWho date=$dateValue at=${parser.getLocation()}"
                return
            }
            from = formatter.parse(filenameValue.substring(extensionIndex - 6, extensionIndex))
        }

        Date to = formatter.parse(timeValue.replace(":", ""))
        float diff = ((float) (to.getTime() - from.getTime())) / 1000.0f // milliseconds -> seconds
        float diff2 = diff + 10 // end time is start time plus 10

        writer.writeStartElement("Turn");
        writer.writeAttribute("startTime", "$diff");
        writer.writeAttribute("endTime", "$diff2");
        writer.writeAttribute("speaker", postingWho);
        writer.writeStartElement("Sync");
        writer.writeAttribute("time", "$diff");
        writer.writeEndElement(); //Sync
        writer.writeCharacters(content)
        writer.writeEndElement(); //Turn
        writer.writeCharacters("\n")
    }

    /**
     * Development entry point working on hard-coded paths.
     *
     * NOTE(review): neither converter is run here; the objects are only
     * constructed. Unless these classes customise Groovy truth, a non-null
     * object is always true, so the "Failed" branch would be unreachable -
     * confirm whether the processing calls are missing.
     */
    public static void main(String[] args) {
        File inputFile = new File("/home/mdecorde/xml/comere/split_out/tt_5-2013-11-12-teicmr.xml")
        File outputFile = new File("/home/mdecorde/xml/comere/split_out/tt_5-2013-11-12-teicmr-broken.trs")
        File outputFixedFile = new File("/home/mdecorde/xml/comere/split_out/tt_5-2013-11-12-teicmr.trs")

        XPathResult xpathProcessor = new XPathResult(inputFile);
        def loc = xpathProcessor.getXpathResponse("//tei:posting/@who")

        def p1 = new TEICMC2TRS(inputFile, loc)
        def p2 = new FixMissingTimings(outputFile, outputFixedFile)

        if (p1 && p2) {
            outputFile.delete()
        } else {
            println "Failed to convert $inputFile to TRS"
        }
    }
}