Révision 3411
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/AddSectionsFromTableV2Macro.groovy (revision 3411) | ||
---|---|---|
1 |
package org.txm.macro.transcription |
|
2 |
|
|
3 |
import java.nio.charset.Charset |
|
4 |
|
|
5 |
import java.time.LocalTime |
|
6 |
import java.time.format.DateTimeFormatter |
|
7 |
|
|
8 |
import org.eclipse.core.internal.localstore.IsSynchronizedVisitor |
|
9 |
import org.txm.utils.* |
|
10 |
import org.txm.utils.logger.* |
|
11 |
|
|
12 |
@Field @Option(name="metadataFile", usage="Tableau des metadonnées de sections", widget="FileOpen", required=true, def="") |
|
13 |
File metadataFile; |
|
14 |
|
|
15 |
@Field @Option(name="trsDirectory", usage="Dossier qui contient les fichiers TRS", widget="Folder", required=true, def="") |
|
16 |
File trsDirectory; |
|
17 |
|
|
18 |
@Field @Option(name="joinTRSColumn", usage="Colonne de jointure de transcription", widget="String", required=true, def="Lien notice principale") |
|
19 |
def joinTRSColumn |
|
20 |
|
|
21 |
@Field @Option(name="startTimeColumn", usage="Colonne de timing de début de section", widget="String", required=true, def="antract_debut") |
|
22 |
def startTimeColumn = "antract_debut" |
|
23 |
|
|
24 |
@Field @Option(name="endTimeColumn", usage="Colonne de timing de fin de section", widget="String", required=true, def="antract_fin") |
|
25 |
def endTimeColumn = "antract_fin" |
|
26 |
|
|
27 |
@Field @Option(name="typeColumns", usage="Colonnes des métadonnées de type de section", widget="String", required=true, def="Titre propre") |
|
28 |
def typeColumns |
|
29 |
|
|
30 |
@Field @Option(name="topicColumns", usage="Colonnes des métadonnées de topic de section", widget="String", required=true, def="Date de diffusion") |
|
31 |
def topicColumns |
|
32 |
|
|
33 |
@Field @Option(name="metadataColumns", usage="Colonnes de metadonnées de section", widget="String", required=true, def="Titre propre;Date de diffusion;Identifiant de la notice;Notes du titre;Type de date;Durée;Genre;Langue VO / VE;Nature de production;Producteurs (Aff.);Thématique;Nom fichier segmenté (info);antract_video;antract_debut;antract_fin;antract_duree;antract_tc_type;antract_tc_date;Résumé;Séquences;Descripteurs (Aff. Lig.);Générique (Aff. Lig.)") |
|
34 |
def metadataColumns |
|
35 |
|
|
36 |
@Field @Option(name="metadataColumnsGroups", usage="Colonnes des gruopes de metadonnées de section", widget="String", required=true, def="metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;secondary;secondary;secondary;secondary;secondary;secondary;secondary;text;text;text;text") |
|
37 |
def metadataColumnsGroups |
|
38 |
|
|
39 |
@Field @Option(name="fixSectionsLimits", usage="Correction des limites de sections du tableau de metadonnees", widget="Boolean", required=true, def="true") |
|
40 |
def fixSectionsLimits |
|
41 |
|
|
42 |
@Field @Option(name="sectionsMergeActivationThreashold", usage="marge d'erreur de corrections des limites de sections", widget="Float", required=true, def="1.0") |
|
43 |
def sectionsMergeActivationThreashold |
|
44 |
|
|
45 |
@Field @Option(name="fixTurnsLimits", usage="Correction des limites de sections du tableau de metadonnees", widget="Boolean", required=true, def="true") |
|
46 |
def fixTurnsLimits |
|
47 |
|
|
48 |
@Field @Option(name="turnsCutActivationThreashold", usage="marge d'erreur de corrections des limites de tours", widget="Float", required=true, def="0.1") |
|
49 |
def turnsCutActivationThreashold |
|
50 |
|
|
51 |
@Field @Option(name="debug", usage="show debug messages", widget="String", required=true, def="false") |
|
52 |
def debug |
|
53 |
|
|
54 |
// Ask the user for the macro parameters; stop silently when the dialog is cancelled.
if (!ParametersDialog.open(this)) return

// The "debug" option is declared with a String widget, so its raw value is a String such as "false",
// which is truthy in Groovy. Convert it to a real boolean instead of forcing debug mode on.
// A Boolean value (e.g. when the option is set programmatically by a calling macro) is kept as is.
debug = (debug instanceof Boolean) ? debug : "true".equalsIgnoreCase(String.valueOf(debug).trim())

// The column options are ";"-separated lists.
typeColumns = typeColumns.split(";")
topicColumns = topicColumns.split(";")
metadataColumns = metadataColumns.split(";")
metadataColumnsGroups = metadataColumnsGroups.split(";")

// Each metadata column must have exactly one group.
if (metadataColumns.size() != metadataColumnsGroups.size()) {
	println "ERROR in metadata declarations&groups:"
	println "COLUMNS: "+metadataColumns
	println "GROUPS : "+metadataColumnsGroups
	return
}

if (!trsDirectory.exists()) {
	println "$trsDirectory not found"
	return
}
|
73 |
|
|
74 |
println "Loading data from $metadataFile..."
TableReader reader = new TableReader(metadataFile)//, "\t".charAt(0), Charset.forName("UTF-8")
reader.readHeaders()
def header = reader.getHeaders()

// Prints the error message and releases the table reader before the script stops.
def reportMissing = { message ->
	println message
	reader.close()
}

// Every column named in the options must exist in the table header.
if (!header.contains(joinTRSColumn)) {
	reportMissing("No TRS ID $joinTRSColumn column found")
	return
}
if (!header.contains(startTimeColumn)) {
	reportMissing("No start time $startTimeColumn column found")
	return
}
if (!header.contains(endTimeColumn)) {
	reportMissing("No end time $endTimeColumn column found")
	return
}
// The three loops below check the loop column itself (they used to re-check endTimeColumn,
// which made them no-ops once the end time column had been validated).
for (def col : metadataColumns) {
	if (!header.contains(col)) {
		reportMissing("No $col column found")
		return
	}
}
for (def col : typeColumns) {
	if (!header.contains(col)) {
		reportMissing("No type $col column found")
		return
	}
}
for (def col : topicColumns) {
	if (!header.contains(col)) {
		reportMissing("No topic $col column found")
		return
	}
}

File outputDirectory = new File(trsDirectory, "out")
println "Writing result to $outputDirectory..."

// Deliberately assigned without "def": it is stored in the script binding so that
// the strTotime() method can read it.
dateTimeFormatter = DateTimeFormatter.ISO_LOCAL_TIME
|
113 |
/**
 * Converts a timecode string into a number of seconds.
 *
 * The text before the last ":" is parsed as a local time with the script-level
 * dateTimeFormatter; the integer after the last ":" is treated as a frame count
 * and added as frames/25 of a second.
 *
 * @param str the timecode read from the metadata table, may be null
 * @return the time in seconds, or null when str is null or contains no ":"
 * @throws NumberFormatException if the part after the last ":" is not an integer
 * @throws java.time.format.DateTimeParseException if the part before the last ":" is not a valid time
 */
def strTotime(def str) {
	if (str == null) {
		return null
	}
	int lastColon = str.lastIndexOf(":")
	if (lastColon == -1) {
		return null
	}

	// Locals are declared explicitly so that nothing leaks into the script binding.
	int frames = Integer.parseInt(str.substring(lastColon + 1))
	if (frames > 25) { // out-of-range frame counts are ignored
		frames = 0
	}

	LocalTime time = LocalTime.parse(str.substring(0, lastColon), dateTimeFormatter)
	int seconds = time.toSecondOfDay() // hours*3600 + minutes*60 + seconds

	// In Groovy, int / int yields a BigDecimal, so the fraction of a second is kept.
	return seconds + (frames / 25)
}
|
131 |
|
|
132 |
// Main processing:
//  1. read the metadata table and group the timed sections by transcription id;
//  2. for each transcription file found in trsDirectory, distribute its turns into new
//     Section nodes built from the table (cutting turns that cross a section end);
//  3. write the result in the "out" directory.
// Variables assigned without "def" (turn, turns, iTurn, iSection, trsSection, currentNode, ...)
// live in the script binding because cutTurn() reads and updates them.
try {
	def sectionGroupsToInsert = [:]
	println "Reading data..."
	while (reader.readRecord()) { // loading & sorting sections
		String id = reader.get(joinTRSColumn).trim()
		if (id.endsWith(".mp4")) id = id.substring(0, id.length()-4)
		if (id.length() == 0) continue;

		if (!sectionGroupsToInsert.containsKey(id)) {
			sectionGroupsToInsert[id] = []
		}
		def section = sectionGroupsToInsert[id]

		if (reader.get(startTimeColumn) != null && reader.get(startTimeColumn).length() > 0) { // ignore non timed sections

			def m = [:] // attributes of the future Section node

			// "topic" and "type" attributes: non-empty values of the selected columns, tab-separated
			for (def todo : ["topic":topicColumns, "type":typeColumns]) {
				def data = []
				for (def col : todo.value) {
					if (reader.get(col).trim().length() > 0) {
						data << reader.get(col).trim().replace("\n", "")
					}
				}
				m[todo.key] = data.join("\t")
			}
			// one attribute per metadata column, plus the list of attribute names and of their groups
			def metadataList = []
			def metadataGroupList = []
			for (int i = 0 ; i < metadataColumns.size() ; i++) {
				def col = metadataColumns[i]
				String c = AsciiUtils.buildAttributeId(col)
				m[c] = reader.get(col)
				metadataList << c
				metadataGroupList << metadataColumnsGroups[i]
			}
			m["metadata"] = metadataList.join("|")
			m["metadata_groups"] = metadataGroupList.join("|")

			m["startTime"] = strTotime(reader.get(startTimeColumn))
			m["endTime"] = strTotime(reader.get(endTimeColumn))
			m["synchronized"] = "true"

			// [start, end, attributes]
			section << [m["startTime"], m["endTime"], m]
		}
	}

	println "Inserting sections... "+sectionGroupsToInsert.size()

	ConsoleProgressBar cpb = new ConsoleProgressBar(sectionGroupsToInsert.keySet().size())
	for (String id : sectionGroupsToInsert.keySet()) {

		File trsFile = new File(trsDirectory, id+".trs")
		if (!trsFile.exists()) {
			cpb.tick()
			continue
		}

		if (debug) println "== $id =="
		else cpb.tick()

		//println "Processing $id..."
		sections = sectionGroupsToInsert[id]
		// Sort by start time; for equal start times the section that ends last comes first,
		// so that an including section precedes the sections it contains.
		// (The previous test "a[1] <=> -b[1]" was not a valid comparator: with positive times
		// it answered 1 in both directions.)
		sections = sections.sort() { a, b -> a[0] <=> b[0] ?: b[1] <=> a[1] }

		if (fixSectionsLimits) {
			if (debug) println "Fixing sections of $id"
			// NOTE(review): only the [start, end, attributes] triple is updated here; the "startTime"
			// entry of the attributes map keeps the table value - confirm this is intended.
			for (int iSection = 1 ; iSection < sections.size() ; iSection++) {
				//println sections[iSection]
				if (Math.abs(sections[iSection][0] - sections[iSection - 1][1]) < sectionsMergeActivationThreashold) {
					if (debug) println "s=$iSection start <- end : "+sections[iSection][0]+ " <- "+sections[iSection - 1][1]
					sections[iSection][0] = sections[iSection - 1][1] // fix the start time with the previous section end time
				}
			}
		}

		// Open input file
		slurper = new groovy.util.XmlParser(false, true, true);
		slurper.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false) // allow DTD declaration
		slurper.setProperty("http://javax.xml.XMLConstants/property/accessExternalDTD", "all"); // allow to read DTD from local file
		trs = slurper.parse(trsFile.toURI().toString())
		trsEpisodes = trs.Episode // 1
		if (trsEpisodes.size() > 1) {
			println "Error: multiple Episode node in $trsFile"
			continue
		}
		def trsEpisode = trsEpisodes[0]
		def trsSections = trs.Episode.Section // 1
		if (trsSections.size() > 1) {
			println "Error: multiple Section node in $trsFile"
			continue
		}

		trsSection = trsSections[0]

		turns = trsSection.Turn
		newSections = []
		iSection = 0;
		currentSection = null
		currentNode = null

		foundSection=null
		isTurnSynchronized=false
		cutCheck=false

		// iterate over the turns, in order
		for (iTurn = 0 ; iTurn < turns.size() ; iTurn++) {

			turn = turns[iTurn]
			start = Float.parseFloat(turn.@startTime)
			end = Float.parseFloat(turn.@endTime)
			//println "Turn: $iTurn ($start, $end)"

			// Step 1: will the turn need to be cut, and which section holds the turn (or its first part)?
			foundSection = null;
			for (int i = iSection ; i < sections.size() ; i++) {
				// if section_end < turn_start OR |turn_start - section_end| < turn_threshold
				if (sections[i][1] < start || Math.abs(start - sections[i][1]) < turnsCutActivationThreashold) {
					// Case 1: the section lies entirely before the turn (within the margin): try the next one
				} else {
					// Case 2: this is the section to consider
					iSection = i
					// if section_start > turn_end OR |section_start - turn_end| < turn_threshold
					if (sections[i][0] > end || Math.abs(sections[i][0] - end) < turnsCutActivationThreashold) {
						// Case 2.1: the section lies entirely after the turn (within the margin), and so will the following ones
						foundSection=null
						isTurnSynchronized=false
						cutCheck=false
					} else {
						// if |section_start - turn_start| > turn_threshold
						if (Math.abs(start - sections[i][0]) > turnsCutActivationThreashold) {
							// Case 2.2: the section start differs significantly from the turn start (the beginning is not synchronized)
							foundSection=sections[i] // (first section met, but it will be for the next turn)
							isTurnSynchronized=false
							cutCheck=true
						} else {
							// Case 2.3: the turn starts within the section (no need to look at other sections:
							// if it is not the only one, the turn will be cut and the rest will be another turn)
							foundSection=sections[i] // (the section that starts the turn, at least)
							isTurnSynchronized=true
							cutCheck=true
						}
					}
					break; // stop searching and set iSection to accelerate next search
				}
			}

			// Step 3: put *the* turn in *the* node
			// (a single turn and a single node are handled per iteration of the turn loop,
			// since the turn is resized so that it does not span several nodes)
			if (foundSection != null) { // complete the current node or add a new one

				if (foundSection != currentSection || currentSection == null) {
					if (currentNode != null && currentNode.@synchronized == "false") {
						// close the previous unsynchronized node with the end time of its last turn
						def tmp = currentNode.Turn
						currentNode.@endTime = tmp[-1].@endTime
					}

					currentSection = foundSection
					currentNode = new Node(trsEpisode, "Section", currentSection[2])
				}
			} else { // add an unsynchronized node

				// NOTE(review): sections[iSection] is read even when no section matched; this presumably
				// fails when the section list of this transcription is empty - confirm.
				currentNode = new Node(trsEpisode, "Section", ["type":"Sujet non synchronisé", "startTime":sections[iSection][0], "endTime":sections[iSection][1], "synchronized":"false"] )
				currentSection = null;
			}

			// Step 2: cut the turn if needed
			if (cutCheck) {
				if (isTurnSynchronized) {
					cutTurn(true) // iSection++ when the turn is cut
				} else {
					if (foundSection != null) {
						cutTurn(false)
					}
				}
			}

			// move the turn from the initial section to the current node
			trsSection.remove(turn)
			currentNode.append(turn)
		}

		outputDirectory.mkdir()
		File outfile = new File(outputDirectory, trsFile.getName())
		outfile.withWriter("UTF-8") { writer ->
			writer.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
			def printer = new groovy.util.XmlNodePrinter(new PrintWriter(writer))
			printer.setPreserveWhitespace(true)
			printer.print(trs)
		}
	}
	cpb.done()
	println "Done."

} catch(Exception e) {
	println "Error: "+e
	Log.printStackTrace(e)
} finally {
	// release the metadata table even when the processing failed
	try {
		reader.close()
	} catch(Exception e) {
		println "Error: "+e
	}
}
|
331 |
|
|
332 |
/**
 * Cuts the current turn at the first "w" or "Sync" child that starts after the end of the
 * current section node (beyond turnsCutActivationThreashold).
 *
 * That child and all the following non-text children are moved to a new Turn node, which is
 * created under trsSection and inserted in the shared turns list at index iTurn (iTurn is then
 * decremented). The end time of the current turn is set to the cut time.
 *
 * Works on the script-level variables turn, turns, iTurn, iSection, trsSection and currentNode.
 *
 * @param incrementISection when true, iSection is incremented when the turn is cut
 */
def cutTurn(def incrementISection) {
	def turnChildren = turn.children()
	Node splitTurn = null // set once the cut point has been found

	// the section limits may be stored as numbers or as strings
	def startSection = currentNode.@startTime
	if (startSection instanceof String) startSection = Float.parseFloat(currentNode.@startTime)
	def endSection = currentNode.@endTime
	if (endSection instanceof String) endSection = Float.parseFloat(currentNode.@endTime)

	int idx = 0
	while (idx < turnChildren.size()) {
		def child = turnChildren[idx]

		// text children are left where they are
		if (child instanceof String) {
			idx++
			continue
		}

		// after the cut point: move the remaining children to the new turn
		// (idx is not advanced since the child has been removed from the list)
		if (splitTurn != null) {
			turn.remove(child)
			splitTurn.append(child)
			if (debug) child.@moved="yes"
			continue
		}

		// read the time position of the child, only "w" and "Sync" elements have one
		boolean isWord = "w".equals(child.name())
		def start2
		def end2
		if (isWord) {
			start2 = Float.parseFloat(child.@startTime)
			end2 = Float.parseFloat(child.@endTime)
		} else if ("Sync".equals(child.name())) {
			start2 = child.@time
			if (start2 instanceof String) start2 = Float.parseFloat(child.@time)
			end2 = start2
		} else {
			// no time to check
			idx++
			continue
		}

		// the child is still in the section: go on
		boolean afterSection = start2 > endSection && Math.abs(start2 - endSection) > turnsCutActivationThreashold
		if (!afterSection) {
			idx++
			continue
		}

		// cut point found: create the new turn and move the child into it
		if (debug) {
			if (isWord) println "cut with a w at ($start2, $end2) for section ("+startSection+", "+endSection+")"
			else println "cut with a Sync at ($start2, $end2) for section ("+startSection+", "+endSection+")"
		}
		splitTurn = new Node(trsSection, "Turn", ["startTime":""+start2, "endTime":""+turn.@endTime, "speaker":turn.@speaker])
		turns.add(iTurn, splitTurn)
		iTurn--
		if (debug) splitTurn.@created = "yes"
		turn.@endTime = ""+start2;
		turn.remove(child)
		splitTurn.append(child)
		if (debug) child.@moved="yes"
		if (incrementISection) iSection++
		// idx is not advanced: the next child now sits at the same index
	}
}
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/AddSectionsFromTableMacro.groovy (revision 3411) | ||
---|---|---|
33 | 33 |
|
34 | 34 |
@Field @Option(name="metadataColumnsGroups", usage="Colonnes des gruopes de metadonnées de section", widget="String", required=true, def="metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;secondary;secondary;secondary;secondary;secondary;secondary;secondary;text;text;text;text") |
35 | 35 |
def metadataColumnsGroups |
36 |
|
|
36 |
|
|
37 | 37 |
@Field @Option(name="fixSectionsLimits", usage="Correction des limites de sections du tableau de metadonnees", widget="Boolean", required=true, def="true") |
38 | 38 |
def fixSectionsLimits |
39 |
|
|
39 |
|
|
40 | 40 |
@Field @Option(name="sectionsMergeActivationThreashold", usage="marge d'erreur de corrections des limites de sections", widget="Float", required=true, def="1.0") |
41 | 41 |
def sectionsMergeActivationThreashold |
42 | 42 |
|
43 |
@Field @Option(name="turnsMergeActivationThreashold", usage="marge d'erreur de corrections des limites de tours", widget="Float", required=true, def="0.1")
|
|
44 |
def turnsMergeActivationThreashold
|
|
43 |
@Field @Option(name="fixTurnsLimits", usage="Correction des limites de sections du tableau de metadonnees", widget="Boolean", required=true, def="true")
|
|
44 |
def fixTurnsLimits
|
|
45 | 45 |
|
46 |
@Field @Option(name="turnsCutActivationThreashold", usage="marge d'erreur de corrections des limites de tours", widget="Float", required=true, def="0.1") |
|
47 |
def turnsCutActivationThreashold |
|
48 |
|
|
46 | 49 |
@Field @Option(name="debug", usage="show debug messages", widget="String", required=true, def="false") |
47 | 50 |
def debug |
48 | 51 |
|
49 | 52 |
if (!ParametersDialog.open(this)) return; |
50 |
debug = "true".equals(debug);
|
|
53 |
debug = true
|
|
51 | 54 |
|
52 | 55 |
typeColumns = typeColumns.split(";") |
53 | 56 |
topicColumns = topicColumns.split(";") |
... | ... | |
127 | 130 |
try { |
128 | 131 |
def sectionGroupsToInsert = [:] |
129 | 132 |
println "Reading data..." |
130 |
while (reader.readRecord()) { |
|
133 |
while (reader.readRecord()) { // loading & sorting sections
|
|
131 | 134 |
String id = reader.get(joinTRSColumn).trim() |
132 | 135 |
if (id.endsWith(".mp4")) id = id.substring(0, id.length()-4) |
133 | 136 |
if (id.length() == 0) continue; |
... | ... | |
175 | 178 |
ConsoleProgressBar cpb = new ConsoleProgressBar(sectionGroupsToInsert.keySet().size()) |
176 | 179 |
for (String id : sectionGroupsToInsert.keySet()) { |
177 | 180 |
|
178 |
|
|
179 | 181 |
File trsFile = new File(trsDirectory, id+".trs") |
180 | 182 |
if (!trsFile.exists()) { |
181 | 183 |
cpb.tick() |
... | ... | |
193 | 195 |
if (debug) println "Fixing sections of $id" |
194 | 196 |
for (int iSection = 1 ; iSection < sections.size() ; iSection++) { |
195 | 197 |
//println sections[iSection] |
196 |
if (sections[iSection][0] - sections[iSection - 1][1] > sectionsMergeActivationThreashold) {
|
|
198 |
if (Math.abs(sections[iSection][0] - sections[iSection - 1][1]) < sectionsMergeActivationThreashold) {
|
|
197 | 199 |
if (debug) println "s=$iSection start <- end : "+sections[iSection][0]+ " <- "+sections[iSection - 1][1] |
198 | 200 |
sections[iSection][0] = sections[iSection - 1][1] // fix the start time with the previous section end time |
199 | 201 |
} |
... | ... | |
207 | 209 |
def trs = slurper.parse(trsFile.toURI().toString()) |
208 | 210 |
def trsEpisodes = trs.Episode // 1 |
209 | 211 |
if (trsEpisodes.size() > 1) { |
210 |
println "multiple Episode node in $trsFile" |
|
212 |
println "Error: multiple Episode node in $trsFile"
|
|
211 | 213 |
continue |
212 | 214 |
} |
213 | 215 |
def trsEpisode = trsEpisodes[0] |
214 | 216 |
def trsSections = trs.Episode.Section // 1 |
215 | 217 |
if (trsSections.size() > 1) { |
216 |
println "multiple Section node in $trsFile" |
|
218 |
println "Error: multiple Section node in $trsFile"
|
|
217 | 219 |
continue |
218 | 220 |
} |
219 | 221 |
def trsSection = trsSections[0] |
... | ... | |
231 | 233 |
def end = Float.parseFloat(turn.@endTime) |
232 | 234 |
//println "Turn: $iTurn ($start, $end)" |
233 | 235 |
|
234 |
def found = null; |
|
236 |
def foundSection = null;
|
|
235 | 237 |
for (int i = iSection ; i < sections.size() ; i++) { |
236 | 238 |
if (end < sections[i][0]) { // Turn is before section |
237 | 239 |
|
238 |
} else if (sections[i][1] < start) { // Turn is before section
|
|
240 |
} else if (sections[i][1] < start) { // Section is before section
|
|
239 | 241 |
|
240 | 242 |
} else { |
241 |
found = sections[i] |
|
243 |
foundSection = sections[i]
|
|
242 | 244 |
iSection = i |
243 | 245 |
break; // stop searching and set iSection to accelerate next search |
244 | 246 |
} |
245 | 247 |
} |
246 | 248 |
|
247 |
if (found == null) { |
|
249 |
if (foundSection == null) {
|
|
248 | 250 |
if (currentSection != null || currentNode == null) { |
249 | 251 |
currentNode = new Node(trsEpisode, "Section", ["type":"Sujet non synchronisé", "startTime":turn.@startTime, "endTime":"", "synchronized":"false"] ) |
250 | 252 |
currentSection = null; |
251 | 253 |
} |
252 | 254 |
} else { |
253 |
if (found != currentSection) { |
|
255 |
if (foundSection != currentSection) {
|
|
254 | 256 |
if (currentNode != null && currentNode.@synchronized == "false") { |
255 | 257 |
def tmp = currentNode.Turn |
256 | 258 |
currentNode.@endTime = tmp[-1].@endTime |
257 | 259 |
} |
258 | 260 |
|
259 |
currentSection = found |
|
261 |
currentSection = foundSection
|
|
260 | 262 |
currentNode = new Node(trsEpisode, "Section", currentSection[2]) |
261 | 263 |
} |
262 | 264 |
} |
265 |
|
|
263 | 266 |
trsSection.remove(turn) |
264 | 267 |
currentNode.append(turn) |
265 |
|
|
266 |
|
|
267 |
if (found) { |
|
268 |
} |
|
269 |
|
|
270 |
//remove the initial section which is empty now or not |
|
271 |
trsEpisode.remove(trsSection) |
|
272 |
|
|
273 |
if (fixTurnsLimits) { |
|
274 |
if (debug) println "Fixing Turn limits..." |
|
275 |
def turnToInsert = null; |
|
276 |
sections = trs.Episode.Section |
|
277 |
for (int i = 0 ; i < sections.size() -1 ; i++) { // browse created sections but stop before the last one (whichc can not be fixed) |
|
278 |
|
|
279 |
def section = sections[i] |
|
280 |
if (turnToInsert != null) { |
|
281 |
if (debug) println "Moving part-of turn: "+turnToInsert+" in section ("+section.@startTime+", "+section.@endTime+")" |
|
282 |
section.children().add(0, turnToInsert) // insert the slited part of the turn in the section |
|
283 |
} |
|
284 |
|
|
285 |
def startSection = section.@startTime // Float.parseFloat(section.@startTime) |
|
286 |
def endSection = section.@endTime // Float.parseFloat(section.@endTime) |
|
287 |
if (startSection instanceof String) startSection = Float.parseFloat(section.@startTime) |
|
288 |
if (endSection instanceof String) endSection = Float.parseFloat(section.@endTime) |
|
289 |
|
|
290 |
turnToInsert = null |
|
291 |
|
|
292 |
turns = section.Turn |
|
293 |
if (turns.size() == 0) continue; |
|
294 |
|
|
295 |
def turn = turns[-1] |
|
296 |
|
|
268 | 297 |
def children = turn.children() |
269 | 298 |
Node newTurnKaNode = null;//new Node(trsEpisode, "Turn", currentSection[2]) |
270 | 299 |
//println "Cut the last turn if necessary" |
271 | 300 |
for (int iChildren = 0 ; iChildren < children.size() ; iChildren++) { |
301 |
|
|
272 | 302 |
def c = children[iChildren] |
303 |
if (c instanceof String) continue; |
|
304 |
|
|
273 | 305 |
if (newTurnKaNode != null) { |
274 | 306 |
turn.remove(c) |
275 | 307 |
newTurnKaNode.append(c) |
... | ... | |
280 | 312 |
def start2 = Float.parseFloat(c.@startTime) |
281 | 313 |
def end2 = Float.parseFloat(c.@endTime) |
282 | 314 |
|
283 |
if (start2 > sections[iSection][1] && Math.abs(start2 - sections[iSection][1]) > turnsMergeActivationThreashold) {
|
|
284 |
if (debug) println "cut with w at [$start2, $end2] for section "+sections[iSection][1]
|
|
315 |
if (start2 > endSection && Math.abs(start2 - endSection) > turnsCutActivationThreashold) {
|
|
316 |
if (debug) println "cut with a w at [$start2, $end2] for section ("+startSection+", "+endSection+")"
|
|
285 | 317 |
newTurnKaNode = new Node(trsSection, "Turn", ["startTime":""+start2, "endTime":""+turn.@endTime, "speaker":turn.@speaker]) |
286 | 318 |
if (debug) newTurnKaNode.@created = "yes" |
287 | 319 |
turn.@endTime = ""+start2; |
288 | 320 |
turn.remove(c) |
289 | 321 |
newTurnKaNode.append(c) |
290 | 322 |
if (debug) c.@moved="yes" |
291 |
turns.add(iTurn+1, newTurnKaNode) |
|
292 | 323 |
iChildren-- |
293 | 324 |
} |
294 | 325 |
} else if ("Sync".equals(c.name())) { |
295 | 326 |
def start2 = Float.parseFloat(c.@time) |
296 | 327 |
def end2 = Float.parseFloat(c.@time) |
297 | 328 |
|
298 |
if (start2 > sections[iSection][1] && Math.abs(start2 - sections[iSection][1]) > turnsMergeActivationThreashold) {
|
|
299 |
if (debug) println "cut with Sync at [$start2, $end2] for section "+sections[iSection][1]
|
|
329 |
if (start2 > endSection && Math.abs(start2 - endSection) > turnsCutActivationThreashold) {
|
|
330 |
if (debug) println "cut with a Sync at [$start2, $end2] for section "+endSection
|
|
300 | 331 |
newTurnKaNode = new Node(trsSection, "Turn", ["startTime":""+start2, "endTime":""+turn.@endTime, "speaker":turn.@speaker]) |
301 | 332 |
if (debug) newTurnKaNode.@created = "yes" |
302 | 333 |
turn.@endTime = ""+start2; |
303 | 334 |
turn.remove(c) |
304 | 335 |
if (debug) c.@moved="yes" |
305 | 336 |
newTurnKaNode.append(c) |
306 |
turns.add(iTurn+1, newTurnKaNode) |
|
307 | 337 |
iChildren-- |
308 | 338 |
} |
309 | 339 |
} else { |
... | ... | |
311 | 341 |
} |
312 | 342 |
} |
313 | 343 |
} |
344 |
|
|
345 |
turnToInsert = newTurnKaNode |
|
346 |
|
|
314 | 347 |
} |
315 | 348 |
} |
316 | 349 |
|
317 |
//remove the initial section |
|
318 |
trsEpisode.remove(trsSection) |
|
319 |
|
|
320 | 350 |
outputDirectory.mkdir() |
321 | 351 |
File outfile = new File(outputDirectory, trsFile.getName()) |
322 | 352 |
outfile.withWriter("UTF-8") { writer -> |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/prototypes/nlp/TT2XMLInDirectoryMacro.groovy (revision 3411) | ||
---|---|---|
1 |
package org.txm.macro.nlp; |
|
2 |
|
|
3 |
import org.kohsuke.args4j.* |
|
4 |
import groovy.transform.Field |
|
5 |
import java.nio.charset.Charset |
|
6 |
import org.txm.rcpapplication.swt.widget.parameters.* |
|
7 |
import org.txm.utils.* |
|
8 |
import javax.xml.stream.* |
|
9 |
|
|
10 |
@Field @Option(name="inputDirectory", usage="TXT directory", widget="Folder", required=true, def="dir") |
|
11 |
File inputDirectory |
|
12 |
|
|
13 |
@Field @Option(name="encoding", usage="File encoding", widget="String", required=false, def="UTF-8") |
|
14 |
String encoding |
|
15 |
|
|
16 |
@Field @Option(name="debug", usage="Debug mode", widget="Boolean", required=false, def="false") |
|
17 |
Boolean debug |
|
18 |
|
|
19 |
if (!ParametersDialog.open(this)) return |
|
20 |
|
|
21 |
encoding = encoding.trim() |
|
22 |
outputDirectory = new File(inputDirectory, "xml") |
|
23 |
outputDirectory.mkdir() |
|
24 |
|
|
25 |
println "Processing: "+inputDirectory |
|
26 |
|
|
27 |
XMLOutputFactory factory = XMLOutputFactory.newInstance() |
|
28 |
|
|
29 |
// List the input files. listFiles() returns null when the directory cannot be listed,
// so the result is checked before it is sorted.
File[] files = inputDirectory.listFiles()
if (files == null || files.length == 0) {
	println "Error: no file to process in $inputDirectory"
	return false;
}

// Line patterns (3 tab-separated fields), compiled once for all files and lines.
def sent_pattern = ~/^([^\t]+)\t(SENT)\t([^\t]+)$/
def w_pattern = ~/^([^\t]+)\t([^\t]+)\t([^\t]+)$/

// Convert each ".tt" file of the directory to an XML file in the output directory:
// one <w> per line, grouped in <p><s>...</s></p> closed by each SENT line.
for (File inputfile : files.sort()) {
	if (inputfile.isDirectory() || inputfile.isHidden() || !inputfile.getName().endsWith(".tt")) continue // ignore
	println " file: "+inputfile

	// output file name = input file name without its extension + ".xml"
	String name = inputfile.getName()
	int idx = name.lastIndexOf(".")
	if (idx > 0) name = name.substring(0, idx)
	File outputfile = new File(outputDirectory, name+".xml")

	Writer output = new OutputStreamWriter(new FileOutputStream(outputfile) , "UTF-8")
	XMLStreamWriter writer = null
	try {
		writer = factory.createXMLStreamWriter(output)

		writer.writeStartDocument("UTF-8","1.0")
		writer.writeCharacters("\n")
		writer.writeStartElement("text")
		writer.writeCharacters("\n")

		int inSent = 0 // 1 while a <p><s> pair is open
		int nLine = 0
		inputfile.eachLine(encoding) { line ->

			nLine++

			if (debug) {
				println nLine+": "+line
			}

			def sent_m = (line =~ sent_pattern)
			boolean sent = sent_m ? true : false

			if (sent && inSent) {
				// end of sentence token: write it then close the sentence
				writer.writeStartElement("w")
				writer.writeAttribute("frpos", sent_m[0][2])
				writer.writeAttribute("frlemma", sent_m[0][3])
				writer.writeCharacters(sent_m[0][1]) // the word form (1st field), as for the other words
				writer.writeEndElement() // close w
				writer.writeCharacters("\n") // a newline text node, like after the other words (was written as an XML comment)
				writer.writeEndElement() // close s
				writer.writeEndElement() // close p (sent)
				writer.writeCharacters("\n")
				inSent=0
			} else {
				if (!inSent) {
					writer.writeStartElement("p")
					writer.writeStartElement("s")
					inSent=1
				}
				def w_m = (line =~ w_pattern)

				if (w_m.size() > 0) {
					writer.writeStartElement("w")
					writer.writeAttribute("frpos", w_m[0][2])
					writer.writeAttribute("frlemma", w_m[0][3])
					writer.writeCharacters(w_m[0][1])
					writer.writeEndElement() // close w
					writer.writeCharacters("\n")
				} else {
					// the line does not have 3 fields: keep it as a word with default properties
					writer.writeStartElement("w")
					writer.writeAttribute("frpos", "NAM")
					writer.writeAttribute("frlemma", "<unknown>")
					writer.writeCharacters(line)
					writer.writeEndElement() // close w
					writer.writeCharacters("\n")
					println "** TT2XMLInDirectory: unknown [word] line pattern, "+nLine+": <"+line+">."
				}
			}
		}

		// close the last sentence if the file does not end with a SENT line
		if (inSent) {
			writer.writeEndElement() // s
			writer.writeEndElement() // p
			writer.writeCharacters("\n")
		}
		writer.writeEndElement() // text
		writer.writeCharacters("\n")
	} finally {
		// always release the output file, even when the conversion failed
		try {
			if (writer != null) writer.close()
		} finally {
			output.close()
		}
	}
}
|
115 |
|
|
116 |
|
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/projects/antract/PrepareAFVOIXOFFCorpusMacro.groovy (revision 3411) | ||
---|---|---|
43 | 43 |
, "topicColumns": "Date de diffusion" |
44 | 44 |
, "metadataColumns": "id;Titre propre;Date de diffusion;Identifiant de la notice;Notes du titre;Type de date;Durée;Genre;Langue VO / VE;Nature de production;Producteurs (Aff.);Thématique;Nom fichier segmenté (info);antract_video;antract_debut;antract_fin;antract_duree;antract_tc_type;antract_tc_date;Résumé;Séquences;Descripteurs (Aff. Lig.);Générique (Aff. Lig.)" |
45 | 45 |
, "metadataColumnsGroups": "secondary;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;secondary;secondary;secondary;secondary;secondary;secondary;secondary;text;text;text;text" |
46 |
, "sectionsMergeActivationThreashold":1.0 |
|
47 |
, "turnsMergeActivationThreashold":0.1 |
|
46 |
, "fixSectionsLimits":true |
|
47 |
, "sectionsMergeActivationThreashold":4.0 |
|
48 |
, "fixTurnsLimits":true |
|
49 |
, "turnsCutActivationThreashold":1.0 |
|
48 | 50 |
, "debug":debug]) |
49 | 51 |
|
50 | 52 |
//copy the emissions file in the source directory |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/macro/projects/antract/BuildAFMetadataMacro.groovy (revision 3411) | ||
---|---|---|
78 | 78 |
"Résumé", "Séquences", "Descripteurs (Aff. Lig.)", "Générique (Aff. Lig.)"); |
79 | 79 |
|
80 | 80 |
lineRules.put("Type de notice", "Notice sujet"); |
81 |
|
|
82 |
dateColumnsSelection.add("Date de diffusion"); |
|
83 |
datePattern = "dd/MM/yyyy" |
|
84 |
|
|
81 |
|
|
85 | 82 |
columnsToCopy.put("Identifiant de la notice", ["id"] as String[]) |
86 | 83 |
|
87 | 84 |
dateColumnsSelection.add("Date de diffusion"); |
TXM/trunk/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xml/pager.groovy (revision 3411) | ||
---|---|---|
325 | 325 |
pagedWriter.writeCharacters("\n") |
326 | 326 |
} |
327 | 327 |
pagedWriter.writeStartElement("table"); |
328 |
if (enableCollapsibles && parser.getAttributeCount() > 2) { |
|
329 |
pagedWriter.writeAttribute("class", "transcription-table collapsiblecontent") |
|
330 |
pagedWriter.writeAttribute("style", "display:none;") |
|
331 |
} else { |
|
332 |
pagedWriter.writeAttribute("class", "transcription-table"); |
|
333 |
} |
|
328 | 334 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) { |
329 | 335 |
pagedWriter.writeStartElement("tr"); |
330 | 336 |
pagedWriter.writeElement("td", parser.getAttributeLocalName(i)); |
331 | 337 |
pagedWriter.writeElement("td", parser.getAttributeValue(i).toString()); |
332 | 338 |
pagedWriter.writeEndElement(); |
333 | 339 |
} |
334 |
if (enableCollapsibles && parser.getAttributeCount() > 2) { |
|
335 |
pagedWriter.writeAttribute("class", "transcription-table collapsiblecontent") |
|
336 |
pagedWriter.writeAttribute("style", "display:none;") |
|
337 |
} else { |
|
338 |
pagedWriter.writeAttribute("class", "transcription-table"); |
|
339 |
} |
|
340 | 340 |
|
341 | 341 |
pagedWriter.writeEndElement() // table |
342 | 342 |
pagedWriter.writeEmptyElement("br") |
TXM/trunk/org.txm.groovy.core/src/java/org/txm/groovy/core/GroovyScriptedImportEngine.java (revision 3411) | ||
---|---|---|
151 | 151 |
// Log.info(GroovyMessages.restartingToolboxSearchengines); |
152 | 152 |
// Toolbox.getEngineManager(EngineType.SEARCH).restartEngines(); |
153 | 153 |
if (project.getDoUpdate()) { |
154 |
Log.info(NLS.bind(TXMCoreMessages.corpusUpdateDoneInP0, ExecTimer.stop()));
|
|
154 |
System.out.println(NLS.bind(TXMCoreMessages.corpusUpdateDoneInP0, ExecTimer.stop()));
|
|
155 | 155 |
} |
156 | 156 |
else { |
157 |
Log.info(NLS.bind(TXMCoreMessages.corpusImportDoneInP0, ExecTimer.stop()));
|
|
157 |
System.out.println(NLS.bind(TXMCoreMessages.corpusImportDoneInP0, ExecTimer.stop()));
|
|
158 | 158 |
} |
159 | 159 |
|
160 | 160 |
} |
TXM/trunk/org.txm.annotation.kr.core/src/org/txm/annotation/kr/core/KRAnnotationEngine.java (revision 3411) | ||
---|---|---|
428 | 428 |
public void notify(TXMResult r, String state) { |
429 | 429 |
|
430 | 430 |
if (r instanceof MainCorpus && "clean".equals(state)) { |
431 |
|
|
431 | 432 |
MainCorpus c = (MainCorpus) r; |
432 | 433 |
|
433 | 434 |
if (c.getProject() == null || c.getProject().getDoUpdate()) { |
... | ... | |
441 | 442 |
if (c.getProjectDirectory() == null || !c.getProjectDirectory().exists()) { |
442 | 443 |
return; |
443 | 444 |
} |
445 |
|
|
446 |
if (ams.get(c) != null && ams.get(c).tempManager != null && ams.get(c).tempManager.getEntityManager() != null ) { |
|
447 |
ams.get(c).tempManager.close(); // free files |
|
448 |
} |
|
449 |
|
|
444 | 450 |
File buildDirectory = new File(c.getProjectDirectory(), "temporary_annotations/" + c.getID()); |
445 | 451 |
if (buildDirectory.exists()) { |
446 | 452 |
DeleteDir.deleteDirectory(buildDirectory); |
447 | 453 |
} |
448 | 454 |
} |
449 | 455 |
else if (r instanceof Project && "clean".equals(state)) { |
456 |
|
|
450 | 457 |
Project p = (Project) r; |
451 | 458 |
for (MainCorpus c : p.getChildren(MainCorpus.class)) { // if any MainCorpus is remaining, clean it |
452 | 459 |
for (String krname : KRAnnotationEngine.getKnowledgeRepositoryNames(c)) { |
TXM/trunk/org.txm.core/src/java/org/txm/objects/Project.java (revision 3411) | ||
---|---|---|
1134 | 1134 |
|
1135 | 1135 |
@Override |
1136 | 1136 |
public void clean() { |
1137 |
|
|
1137 | 1138 |
if (rcpProject != null) { |
1138 | 1139 |
try { |
1139 | 1140 |
// Toolbox.getEngineManager(EngineType.SEARCH).getEngine("CQP").stop(); //$NON-NLS-1$ |
... | ... | |
1143 | 1144 |
// Toolbox.getEngineManager(EngineType.SEARCH).getEngine("CQP").start(null); //$NON-NLS-1$ |
1144 | 1145 |
} |
1145 | 1146 |
catch (Exception e) { |
1146 |
// TODO Auto-generated catch block
|
|
1147 |
e.printStackTrace();
|
|
1147 |
Log.warning("Warning: error while deleting the corpus: "+e);
|
|
1148 |
Log.printStackTrace(e);
|
|
1148 | 1149 |
} |
1149 | 1150 |
} |
1150 | 1151 |
|
TXM/trunk/org.txm.rcp/src/main/java/org/txm/rcp/editors/imports/sections/TextualPlansSection.java (revision 3411) | ||
---|---|---|
85 | 85 |
gdata = getTextGridData(); |
86 | 86 |
milestoneElementsText.setLayoutData(gdata); |
87 | 87 |
|
88 |
Label label = toolkit.createLabel(sectionClient, "Projections", SWT.WRAP);
|
|
88 |
Label label = toolkit.createLabel(sectionClient, "CQP Structure properties projections", SWT.WRAP);
|
|
89 | 89 |
gdata = getLabelGridData(); |
90 | 90 |
gdata.colspan = 2; |
91 | 91 |
label.setLayoutData(gdata); |
Formats disponibles : Unified diff