root / tmp / org.txm.groovy.core / src / groovy / org / txm / scripts / importer / transcriber / AddSections.groovy @ 2369
History | View | Annotate | Download (7 kB)
1 |
import java.nio.charset.Charset |
---|---|
2 |
|
3 |
import java.time.LocalTime |
4 |
import java.time.format.DateTimeFormatter |
5 |
import org.txm.utils.* |
6 |
|
7 |
// ---------------------------------------------------------------------------
// Script parameters
// ---------------------------------------------------------------------------

// Spreadsheet read by the script: one row per section to insert.
File metadataFile = new File("/home/mdecorde/TEMP/ANTRACT/AF/sujets.xlsx")
// Directory in which the "<id>.trs" files are looked up.
File trsDirectory = new File("/home/mdecorde/TEMP/ANTRACT/AF/trs")

// Column whose value (minus an optional ".mp4" suffix) names the TRS file of a row.
def joinTRSColumn = "Lien notice principale"
// Columns concatenated into the "type" and "topic" attributes of a section.
def typeColumns = ["Titre propre"]
def topicColumns = ["Date de diffusion"]
// Columns holding the start and end timecodes of a section.
def startTimeColumn = "antract_debut"
def endTimeColumn = "antract_fin"

// Columns copied as attributes on each inserted section.
//def metadataColumns = ["Identifiant de la notice", "Titre propre", "antract_debut", "antract_fin"]
def metadataColumns = [
	// "metadata" group
	"Titre propre", "Date de diffusion", "Identifiant de la notice", "Notes du titre", "Type de date", "Durée", "Genre", "Langue VO / VE", "Nature de production", "Producteurs (Aff.)", "Thématique",
	// "secondary" group
	"Nom fichier segmenté (info)", "antract_video", "antract_debut", "antract_fin", "antract_duree", "antract_tc_type", "antract_tc_date",
	// "text" group
	"Résumé", "Séquences", "Descripteurs (Aff. Lig.)", "Générique (Aff. Lig.)"]

// Group name of each metadata column, position by position:
// 11 "metadata" entries, then 7 "secondary", then 4 "text".
def metadataColumnsGroups = (["metadata"] * 11) + (["secondary"] * 7) + (["text"] * 4)

// Both lists are walked in parallel later on, so they must have the same length.
boolean declarationsAligned = (metadataColumns.size() == metadataColumnsGroups.size())
if (!declarationsAligned) {
	println "ERROR in metadata declarations&groups:"
	println "COLUMNS: "+metadataColumns
	println "GROUPS : "+metadataColumnsGroups
	return
}

// Nothing can be done without the transcription directory.
if (!trsDirectory.exists()) {
	println "$trsDirectory not found"
	return
}
36 |
|
37 |
// Open the metadata table and check that every column the script reads is present.
println "Loading data from $metadataFile..."
TableReader reader = new TableReader(metadataFile)//, "\t".charAt(0), Charset.forName("UTF-8")
reader.readHeaders()
def header = reader.getHeaders()

// Each entry pairs the label used in the error message with the columns to look up.
// The order matters: the first missing column (in this order) is the one reported.
def requiredColumns = [
	["TRS ID ", [joinTRSColumn]],
	["start time ", [startTimeColumn]],
	["end time ", [endTimeColumn]],
	["", metadataColumns],
	["type ", typeColumns],
	["topic ", topicColumns]
]
for (def requirement : requiredColumns) {
	def label = requirement[0]
	for (def col : requirement[1]) {
		// Test the column being iterated (the previous code tested endTimeColumn
		// in the metadata/type/topic loops, so a missing column was never detected).
		if (!header.contains(col)) {
			println "No ${label}${col} column found"
			reader.close() // do not leave the table open when aborting
			return
		}
	}
}

// Results are written in an "out" sub-directory of the TRS directory.
File outputDirectory = new File(trsDirectory, "out")
println "Writing result to $outputDirectory..."

// Assigned without 'def' so that it lives in the script binding, where script
// methods can read it.
dateTimeFormatter = DateTimeFormatter.ISO_LOCAL_TIME
76 |
/**
 * Converts a timecode such as "HH:mm:ss:ff" into a number of seconds.
 *
 * The text after the last ':' is parsed as an integer frame count and divided by 25
 * (one second = 25 frames); the text before it is parsed with
 * DateTimeFormatter.ISO_LOCAL_TIME.
 * A frame count greater than 25 is ignored (counted as 0).
 *
 * @param str the timecode; may be null
 * @return the time in seconds (hours*3600 + minutes*60 + seconds + frames/25),
 *         or null when str is null or contains no ':'
 * @throws NumberFormatException if the part after the last ':' is not an integer
 * @throws java.time.format.DateTimeParseException if the part before the last ':' is not an ISO local time
 */
def strTotime(def str) {
	// A missing cell is handled like an empty one: there is no time to compute.
	if (str == null) {
		return null
	}
	int lastColon = str.lastIndexOf(":")
	if (lastColon == -1) {
		return null
	}

	// Locals are declared so they no longer leak into the script binding.
	int bonusFrame = Integer.parseInt(str.substring(lastColon + 1))
	if (bonusFrame > 25) {
		bonusFrame = 0
	}

	LocalTime time1 = LocalTime.parse(str.substring(0, lastColon), DateTimeFormatter.ISO_LOCAL_TIME)
	int totalSeconds = (time1.getHour()*60*60) + (time1.getMinute()*60) + time1.getSecond()

	// In Groovy, int / int yields a BigDecimal: the fraction of a second is kept.
	return totalSeconds + (bonusFrame/25)
}
94 |
|
95 |
// Reads the table and gathers, per TRS id, the timed sections to insert.
// sectionGroupsToInsert: id -> list of [startTime, endTime, attributes map]
def sectionGroupsToInsert = [:]
println "Reading data..."
while (reader.readRecord()) {
	// A missing id cell is treated like an empty one (the row is skipped).
	String id = reader.get(joinTRSColumn)?.trim() ?: ""
	if (id.endsWith(".mp4")) id = id.substring(0, id.length()-4)
	if (id.length() == 0) continue

	// The id is registered even when the row carries no timecode.
	if (!sectionGroupsToInsert.containsKey(id)) {
		sectionGroupsToInsert[id] = []
	}
	def section = sectionGroupsToInsert[id]

	String startValue = reader.get(startTimeColumn)
	if (startValue == null || startValue.length() == 0) continue // ignore non timed sections

	def m = [:]

	// "topic" and "type" are the non-empty values of their columns, tab-separated.
	for (def todo : ["topic":topicColumns, "type":typeColumns]) {
		def data = []
		for (def col : todo.value) {
			String value = reader.get(col)?.trim()
			if (value) { // neither null nor empty
				data << value.replace("\n", "")
			}
		}
		m[todo.key] = data.join("\t")
	}

	// One attribute per metadata column; "metadata" and "metadata_groups" list the
	// attribute names and their groups, in the same order, separated by '|'.
	def metadataList = []
	def metadataGroupList = []
	for (int i = 0 ; i < metadataColumns.size() ; i++) {
		def col = metadataColumns[i]
		String c = AsciiUtils.buildAttributeId(col)
		m[c] = reader.get(col)
		metadataList << c
		metadataGroupList << metadataColumnsGroups[i]
	}
	m["metadata"] = metadataList.join("|")
	m["metadata_groups"] = metadataGroupList.join("|")

	m["startTime"] = strTotime(startValue)
	// A missing end cell is passed as "" so that strTotime yields null instead of failing.
	m["endTime"] = strTotime(reader.get(endTimeColumn) ?: "")
	m["synchronized"] = "true"

	section << [m["startTime"], m["endTime"], m]
}
139 |
|
140 |
// For each TRS file with known sections: move every Turn of its single Section
// into new Section nodes (one per matching table section, plus "unsynchronized"
// sections for turns that match none), then write the result in outputDirectory.
println "Inserting sections... "+sectionGroupsToInsert.size()

// Gives an unsynchronized Section node the end time of the last Turn it holds.
def closeUnsynchronizedSection = { node ->
	if (node != null && node.@synchronized == "false") {
		def nodeTurns = node.Turn
		if (nodeTurns.size() > 0) {
			node.@endTime = nodeTurns[-1].@endTime
		}
	}
}

ConsoleProgressBar cpb = new ConsoleProgressBar(sectionGroupsToInsert.keySet().size())
for (String id : sectionGroupsToInsert.keySet()) {
	cpb.tick()
	File trsFile = new File(trsDirectory, id+".trs")
	if (!trsFile.exists()) {
		continue
	}
	//println "Processing $id..."
	def sections = sectionGroupsToInsert[id]
	// Order by start time, then by end time.
	sections = sections.sort() { a, b -> a[0] <=> b[0] ?: a[1] <=> b[1] }

	// Open input file
	def slurper = new XmlParser();
	// DOCTYPE declarations are allowed: the output written below starts with one,
	// so the input files presumably do too.
	slurper.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false)

	def trs = slurper.parse(trsFile.toURI().toString())
	def trsEpisodes = trs.Episode // 1
	if (trsEpisodes.size() == 0) { // nothing to restructure; avoids a null Episode below
		println "no Episode node in $trsFile"
		continue
	}
	if (trsEpisodes.size() > 1) {
		println "multiple Episode node in $trsFile"
		continue
	}
	def trsEpisode = trsEpisodes[0]
	def trsSections = trs.Episode.Section // 1
	if (trsSections.size() == 0) { // nothing to restructure; avoids a null Section below
		println "no Section node in $trsFile"
		continue
	}
	if (trsSections.size() > 1) {
		println "multiple Section node in $trsFile"
		continue
	}
	def trsSection = trsSections[0]

	def turns = trsSection.Turn
	def iSection = 0;          // index from which the section search restarts
	def currentSection = null  // table section of the node being filled (null when unsynchronized)
	def currentNode = null     // Section node currently receiving turns

	for (def turn : turns) {
		def start = Float.parseFloat(turn.@startTime)
		def end = Float.parseFloat(turn.@endTime)

		// Find the first section (from iSection on) that overlaps the turn.
		def found = null;
		for (int i = iSection ; i < sections.size() ; i++) {
			if (end < sections[i][0]) { // Turn is before section

			} else if (sections[i][1] < start) { // Turn is after section

			} else {
				found = sections[i]
				iSection = i
				break; // stop searching and set iSection to accelerate next search
			}
		}

		if (found == null) {
			// Open an unsynchronized section unless one is already being filled.
			if (currentSection != null || currentNode == null) {
				currentNode = new Node(trsEpisode, "Section", ["type":"Sujet non synchronisé", "topic":"none", "startTime":turn.@startTime, "endTime":"", "synchronized":"false"] )
				currentSection = null;
			}
		} else {
			if (found != currentSection) {
				// Leaving an unsynchronized section: its end time is now known.
				closeUnsynchronizedSection(currentNode)

				currentSection = found
				currentNode = new Node(trsEpisode, "Section", currentSection[2])
			}
		}
		trsSection.remove(turn)
		currentNode.append(turn)
	}

	// When the file ends with an unsynchronized section, no later section closes
	// it: set its end time here instead of leaving endTime="".
	closeUnsynchronizedSection(currentNode)

	//remove the initial section
	trsEpisode.remove(trsSection)

	outputDirectory.mkdir()
	File outfile = new File(outputDirectory, trsFile.getName())
	outfile.withWriter("UTF-8") { writer ->
		// The XML declaration and DOCTYPE are written by hand before the node tree.
		writer.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
		def printer = new groovy.util.XmlNodePrinter(new PrintWriter(writer))
		printer.setPreserveWhitespace(true)
		printer.print(trs)
	}
}
cpb.done()
reader.close()
println "Done."
|