// Statistics | Revision:
// root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / transcriber / AddSections.groovy @ 2369
// History | View | Annotate | Download (7 kB)
1
import java.nio.charset.Charset
2

    
3
import java.time.LocalTime
4
import java.time.format.DateTimeFormatter
5
import org.txm.utils.*
6

    
7
// Inputs: the metadata spreadsheet and the directory holding the .trs transcriptions.
File metadataFile = new File("/home/mdecorde/TEMP/ANTRACT/AF/sujets.xlsx")
File trsDirectory = new File("/home/mdecorde/TEMP/ANTRACT/AF/trs")

// Column whose value (minus an optional ".mp4" suffix) names the .trs file a row belongs to.
def joinTRSColumn = "Lien notice principale"
// Columns joined (tab-separated) into the "type" and "topic" attributes of each inserted Section.
def typeColumns = ["Titre propre"]
def topicColumns = ["Date de diffusion"]
// Columns holding the start and end timecodes of a section.
def startTimeColumn = "antract_debut"
def endTimeColumn = "antract_fin"

// Columns copied as attributes onto each inserted Section, one entry per line group.
// A shorter variant once used here: ["Identifiant de la notice", "Titre propre", "antract_debut", "antract_fin"]
def metadataColumns = [
	// "metadata" group
	"Titre propre",
	"Date de diffusion",
	"Identifiant de la notice",
	"Notes du titre",
	"Type de date",
	"Durée",
	"Genre",
	"Langue VO / VE",
	"Nature de production",
	"Producteurs (Aff.)",
	"Thématique",
	// "secondary" group
	"Nom fichier segmenté (info)",
	"antract_video",
	"antract_debut",
	"antract_fin",
	"antract_duree",
	"antract_tc_type",
	"antract_tc_date",
	// "text" group
	"Résumé",
	"Séquences",
	"Descripteurs (Aff. Lig.)",
	"Générique (Aff. Lig.)"
]

// Group label of each entry of metadataColumns, position by position (11 + 7 + 4 = 22 labels).
def metadataColumnsGroups = (["metadata"] * 11) + (["secondary"] * 7) + (["text"] * 4)
24

    
25
// Each metadata column needs exactly one group label: stop the script when the two lists disagree.
boolean declarationsMatch = (metadataColumns.size() == metadataColumnsGroups.size())
if (!declarationsMatch) {
	println "ERROR in metadata declarations&groups:"
	println "COLUMNS: "+metadataColumns
	println "GROUPS : "+metadataColumnsGroups
	return
}

// Nothing can be processed without the transcription directory.
if (trsDirectory.exists() == false) {
	println "${trsDirectory} not found"
	return
}
36

    
37
// Open the metadata table and check that every column the script reads is present in its header.
println "Loading data from ${metadataFile}..."
// The original author left extra constructor arguments commented out here:
// , "\t".charAt(0), Charset.forName("UTF-8")
TableReader reader = new TableReader(metadataFile)
reader.readHeaders()
def header = reader.getHeaders()

// [message label, column name] pairs, in the order the checks are reported.
def requiredColumns = [
	["TRS ID ", joinTRSColumn],
	["start time ", startTimeColumn],
	["end time ", endTimeColumn]
]
for (def col : metadataColumns) {
	requiredColumns << ["", col]
}
for (def col : typeColumns) {
	requiredColumns << ["type ", col]
}
for (def col : topicColumns) {
	requiredColumns << ["topic ", col]
}

// Plain 'for' loop (not a closure) so that 'return' really ends the script.
for (def required : requiredColumns) {
	String label = required[0]
	def column = required[1]
	// Test the column being reported. The previous code tested endTimeColumn in the
	// metadata/type/topic loops, so a missing column there was never detected.
	if (!header.contains(column)) {
		println "No ${label}${column} column found"
		reader.close() // do not leave the table open when giving up
		return
	}
}
71

    
72
// Rewritten transcriptions go to an "out" sub-directory of the transcription directory.
File outputDirectory = new File(trsDirectory, "out")
println "Writing result to ${outputDirectory}..."
74

    
75
// Deliberately assigned without 'def': an undeclared script variable is stored in the
// script binding, which is what makes it readable from the strTotime method below.
dateTimeFormatter = DateTimeFormatter.ISO_LOCAL_TIME

/**
 * Converts a timecode into a number of seconds.
 *
 * The text before the last ':' is parsed with DateTimeFormatter.ISO_LOCAL_TIME and
 * converted to whole seconds; the integer after the last ':' is divided by 25 and added
 * (presumably a frame count at 25 frames per second -- TODO confirm).
 *
 * @param str the timecode, e.g. "00:01:02:05"; may be null
 * @return the time in seconds, or null when str is null or contains no ':'
 * @throws NumberFormatException when the part after the last ':' is not an integer
 * @throws java.time.format.DateTimeParseException when the part before the last ':' is not an ISO local time
 */
def strTotime(def str) {
	if (str == null) {
		return null
	}
	int separator = str.lastIndexOf(":")
	if (separator == -1) {
		return null
	}

	// Locals are declared here: left undeclared they would be written into the script binding.
	int bonusFrame = Integer.parseInt(str.substring(separator + 1))
	// NOTE(review): values above 25 are discarded, but 25 itself is kept and adds a full
	// second -- confirm whether '>= 25' was intended.
	if (bonusFrame > 25) {
		bonusFrame = 0
	}

	LocalTime time = LocalTime.parse(str.substring(0, separator), dateTimeFormatter)
	int totalSeconds = (time.getHour() * 60 * 60) + (time.getMinute() * 60) + time.getSecond()

	// Groovy's '/' on two integers yields a BigDecimal, so the fraction is not truncated.
	return totalSeconds + (bonusFrame / 25)
}
94

    
95
// Maps each transcription id to the list of [startTime, endTime, attributes] sections read for it.
def sectionGroupsToInsert = [:]
println "Reading data..."
while (reader.readRecord()) {
	// reader.get() may return null (see the start time test below), so guard before trim().
	String id = reader.get(joinTRSColumn)?.trim()
	if (id == null) continue
	if (id.endsWith(".mp4")) id = id.substring(0, id.length()-4)
	if (id.length() == 0) continue

	// The id is registered even when the row carries no timing, so the matching
	// transcription is still rewritten later on.
	if (!sectionGroupsToInsert.containsKey(id)) {
		sectionGroupsToInsert[id] = []
	}
	def section = sectionGroupsToInsert[id]

	def startValue = reader.get(startTimeColumn)
	if (startValue == null || startValue.length() == 0) continue // ignore non timed sections

	// Attributes of the Section node to insert.
	def m = [:]

	// "topic" and "type" are the non-empty values of their columns, joined with tabs.
	for (def todo : ["topic":topicColumns, "type":typeColumns]) {
		def data = []
		for (def col : todo.value) {
			def value = reader.get(col)?.trim()
			if (value) { // neither null nor empty
				data << value.replace("\n", "")
			}
		}
		m[todo.key] = data.join("\t")
	}

	// Copy each metadata column under its attribute id, remembering ids and groups in order.
	def metadataList = []
	def metadataGroupList = []
	for (int i = 0 ; i < metadataColumns.size() ; i++) {
		def col = metadataColumns[i]
		String c = AsciiUtils.buildAttributeId(col)
		m[c] = reader.get(col)
		metadataList << c
		metadataGroupList << metadataColumnsGroups[i]
	}
	m["metadata"] = metadataList.join("|")
	m["metadata_groups"] = metadataGroupList.join("|")

	def endValue = reader.get(endTimeColumn)
	m["startTime"] = strTotime(startValue)
	// A missing end time is stored as null instead of being passed on to strTotime.
	m["endTime"] = (endValue == null) ? null : strTotime(endValue)
	m["synchronized"] = "true"

	section << [m["startTime"], m["endTime"], m]
}
139

    
140
// For each transcription with metadata rows: regroup its Turn nodes into one Section node
// per metadata section (or a placeholder Section when no metadata section overlaps the turn),
// then write the result into outputDirectory.
println "Inserting sections... "+sectionGroupsToInsert.size()

ConsoleProgressBar cpb = new ConsoleProgressBar(sectionGroupsToInsert.keySet().size())
for (String id : sectionGroupsToInsert.keySet()) {
	cpb.tick()
	File trsFile = new File(trsDirectory, id+".trs")
	if (!trsFile.exists()) {
		continue
	}
	//println "Processing $id..."
	def sections = sectionGroupsToInsert[id]
	// order by start time, then by end time
	sections = sections.sort() { a, b -> a[0] <=> b[0] ?: a[1] <=> b[1] }

	// Open input file; DOCTYPE declarations are explicitly allowed on this parser.
	def slurper = new XmlParser();
	slurper.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false)

	def trs = slurper.parse(trsFile.toURI().toString())
	def trsEpisodes = trs.Episode // exactly 1 expected
	if (trsEpisodes.size() == 0) {
		// without this guard the code below dereferences a null Section
		println "no Episode node in $trsFile"
		continue
	}
	if (trsEpisodes.size() > 1) {
		println "multiple Episode node in $trsFile"
		continue
	}
	def trsEpisode = trsEpisodes[0]
	def trsSections = trs.Episode.Section // exactly 1 expected
	if (trsSections.size() == 0) {
		println "no Section node in $trsFile"
		continue
	}
	if (trsSections.size() > 1) {
		println "multiple Section node in $trsFile"
		continue
	}
	def trsSection = trsSections[0]

	def turns = trsSection.Turn
	def iSection = 0;
	def currentSection = null // metadata section of the node being filled, null for a placeholder
	def currentNode = null // Section node currently receiving turns

	for (def turn : turns) {
		def start = Float.parseFloat(turn.@startTime)
		def end = Float.parseFloat(turn.@endTime)

		// Find the first metadata section overlapping this turn.
		def found = null;
		for (int i = iSection ; i < sections.size() ; i++) {
			if (end < sections[i][0]) { // Turn is before section

			} else if (sections[i][1] < start) { // Turn is after section

			} else {
				found = sections[i]
				iSection = i
				break; // stop searching and set iSection to accelerate next search
			}
		}

		if (found == null) {
			// Open a placeholder section unless one is already being filled.
			if (currentSection != null || currentNode == null) {
				currentNode = new Node(trsEpisode, "Section", ["type":"Sujet non synchronisé", "topic":"none", "startTime":turn.@startTime, "endTime":"", "synchronized":"false"] )
				currentSection = null;
			}
		} else {
			if (found != currentSection) {
				// Leaving a placeholder: it ends where its last turn ends.
				if (currentNode != null && currentNode.@synchronized == "false") {
					def tmp = currentNode.Turn
					currentNode.@endTime = tmp[-1].@endTime
				}

				currentSection = found
				currentNode = new Node(trsEpisode, "Section", currentSection[2])
			}
		}
		// Move the turn from the initial section into the current one.
		trsSection.remove(turn)
		currentNode.append(turn)
	}

	// A placeholder still open after the last turn would otherwise keep its empty endTime:
	// the loop only fills it in when a synchronized section follows.
	if (currentNode != null && currentNode.@synchronized == "false") {
		def tmp = currentNode.Turn
		currentNode.@endTime = tmp[-1].@endTime
	}

	//remove the initial section
	trsEpisode.remove(trsSection)

	outputDirectory.mkdir()
	File outfile = new File(outputDirectory, trsFile.getName())
	outfile.withWriter("UTF-8") { writer ->
		writer.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
		def printer = new groovy.util.XmlNodePrinter(new PrintWriter(writer))
		printer.setPreserveWhitespace(true)
		printer.print(trs)
	}
}
cpb.done()
reader.close()
println "Done."