Révision 2942

tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/AddSectionsFromTableMacro.groovy (revision 2942)
1
package org.txm.macro.transcription
2

  
3
import java.nio.charset.Charset
4

  
5
import java.time.LocalTime
6
import java.time.format.DateTimeFormatter
7
import org.txm.utils.*
8

  
9
@Field @Option(name="metadataFile", usage="Tableau des metadonnées de sections", widget="FileOpen", required=true, def="")
10
File metadataFile;
11

  
12
@Field @Option(name="trsDirectory", usage="Dossier qui contient les fichiers TRS", widget="Folder", required=true, def="")
13
File trsDirectory;
14

  
15
@Field @Option(name="joinTRSColumn", usage="Colonne de jointure de transcription", widget="String", required=true, def="Lien notice principale")
16
def joinTRSColumn
17

  
18
@Field @Option(name="startTimeColumn", usage="Colonne de timing de début de section", widget="String", required=true, def="antract_debut")
19
def startTimeColumn = "antract_debut"
20

  
21
@Field @Option(name="endTimeColumn", usage="Colonne de timing de fin de section", widget="String", required=true, def="antract_fin")
22
def endTimeColumn = "antract_fin"
23

  
24
@Field @Option(name="typeColumns", usage="Colonnes des métadonnées de type de section", widget="String", required=true, def="Titre propre")
25
def typeColumns
26

  
27
@Field @Option(name="topicColumns", usage="Colonnes des métadonnées de topic de section", widget="String", required=true, def="Date de diffusion")
28
def topicColumns
29

  
30
@Field @Option(name="metadataColumns", usage="Colonnes de metadonnées de section", widget="String", required=true, def="Titre propre;Date de diffusion;Identifiant de la notice;Notes du titre;Type de date;Durée;Genre;Langue VO / VE;Nature de production;Producteurs (Aff.);Thématique;Nom fichier segmenté (info);antract_video;antract_debut;antract_fin;antract_duree;antract_tc_type;antract_tc_date;Résumé;Séquences;Descripteurs (Aff. Lig.);Générique (Aff. Lig.)")
31
def metadataColumns
32

  
33
// Group name of each metadataColumns entry, ";"-separated, in the same order.
// The script aborts when both lists do not have the same number of entries.
@Field @Option(name="metadataColumnsGroups", usage="Colonnes des groupes de metadonnées de section", widget="String", required=true, def="metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;secondary;secondary;secondary;secondary;secondary;secondary;secondary;text;text;text;text")
def metadataColumnsGroups
35

  
36
// Show the parameters dialog; stop here when it does not validate.
if (!ParametersDialog.open(this)) {
	return
}

// Multi-valued parameters are typed as ";"-separated strings: turn them into arrays.
typeColumns = typeColumns.split(";")
topicColumns = topicColumns.split(";")
metadataColumns = metadataColumns.split(";")
metadataColumnsGroups = metadataColumnsGroups.split(";")

// Each metadata column needs its group: both declarations must have the same length.
def declaredColumns = metadataColumns.size()
def declaredGroups = metadataColumnsGroups.size()
if (declaredColumns != declaredGroups) {
	println "ERROR in metadata declarations&groups:"
	println "COLUMNS: "+metadataColumns
	println "GROUPS : "+metadataColumnsGroups
	return
}

// Nothing can be done without the directory of TRS files.
def trsDirectoryFound = trsDirectory.exists()
if (!trsDirectoryFound) {
	println "$trsDirectory not found"
	return
}
53

  
54
// Open the metadata table and check that every column used by the macro is declared in its header.
println "Loading data from $metadataFile..."
TableReader reader = new TableReader(metadataFile)//, "\t".charAt(0), Charset.forName("UTF-8")
reader.readHeaders()
def header = reader.getHeaders()

// [label used in the error message, column name], in the order the columns are checked
def requiredColumns = [
	["TRS ID ", joinTRSColumn],
	["start time ", startTimeColumn],
	["end time ", endTimeColumn]
]
for (def col : metadataColumns) {
	requiredColumns << ["", col]
}
for (def col : typeColumns) {
	requiredColumns << ["type ", col]
}
for (def col : topicColumns) {
	requiredColumns << ["topic ", col]
}

for (def required : requiredColumns) {
	// test the column itself (the previous loops tested endTimeColumn for every metadata/type/topic column)
	if (!header.contains(required[1])) {
		println "No ${required[0]}${required[1]} column found"
		reader.close() // do not leave the table open when aborting
		return
	}
}
88

  
89
File outputDirectory = new File(trsDirectory, "out")
90
println "Writing result to $outputDirectory..."
91

  
92
dateTimeFormatter = DateTimeFormatter.ISO_LOCAL_TIME

/**
 * Converts a timecode to a number of seconds.
 *
 * The text before the last ":" is parsed as an ISO local time and converted to
 * seconds; the integer after the last ":" is a frame count which is divided by
 * 25 and added to the seconds (e.g. "00:01:02:10" gives 62.4).
 * A frame count greater than 25 is ignored (counted as 0).
 *
 * @param str the timecode, may be null
 * @return the time in seconds, or null when str is null or contains no ":"
 */
def strTotime(def str) {
	if (str == null) {
		return null
	}
	int lastColon = str.lastIndexOf(":")
	if (lastColon == -1) {
		return null
	}

	// declared locally so that the values do not leak into the script binding
	int bonusFrame = Integer.parseInt(str.substring(lastColon + 1))
	//if (str.contains("135475")) println "ERROR $str in $infos -> $bonusFrame"
	if (bonusFrame > 25) {
		bonusFrame = 0
	}

	LocalTime time = LocalTime.parse(str.substring(0, lastColon), dateTimeFormatter)
	int totalSeconds = time.toSecondOfDay() // hours*3600 + minutes*60 + seconds

	return totalSeconds + (bonusFrame / 25)
}
111

  
112
// Group the timed sections of the table by transcription id (the joinTRSColumn value without ".mp4").
// Each entry of a group is [startTime, endTime, attributes of the Section node to create].
def sectionGroupsToInsert = [:]
println "Reading data..."
while (reader.readRecord()) {
	// a missing cell is handled like an empty one instead of failing on null
	String id = (reader.get(joinTRSColumn) ?: "").trim()
	if (id.endsWith(".mp4")) id = id.substring(0, id.length()-4)
	if (id.length() == 0) continue

	if (!sectionGroupsToInsert.containsKey(id)) {
		sectionGroupsToInsert[id] = []
	}
	def section = sectionGroupsToInsert[id]

	def startValue = reader.get(startTimeColumn)
	if (startValue == null || startValue.length() == 0) continue // ignore non timed sections

	def m = [:]

	// "topic" and "type" attributes: the non-empty values of their columns, tab-separated
	for (def todo : ["topic":topicColumns, "type":typeColumns]) {
		def data = []
		for (def col : todo.value) {
			def value = (reader.get(col) ?: "").trim()
			if (value.length() > 0) {
				data << value.replace("\n", "")
			}
		}
		m[todo.key] = data.join("\t")
	}

	// one attribute per metadata column, plus the "|"-separated lists of attribute names and of their groups
	def metadataList = []
	def metadataGroupList = []
	for (int i = 0 ;  i < metadataColumns.size() ; i++) {
		def col = metadataColumns[i]
		String c = AsciiUtils.buildAttributeId(col)
		m[c] = reader.get(col)
		metadataList << c
		metadataGroupList << metadataColumnsGroups[i]
	}
	m["metadata"] = metadataList.join("|")
	m["metadata_groups"] = metadataGroupList.join("|")

	m["startTime"] = strTotime(startValue)
	m["endTime"] = strTotime(reader.get(endTimeColumn))
	m["synchronized"] = "true"

	section << [m["startTime"], m["endTime"], m]
}
156

  
157
println "Inserting sections... "+sectionGroupsToInsert.size()
158

  
159
ConsoleProgressBar cpb = new ConsoleProgressBar(sectionGroupsToInsert.keySet().size())
160
// For each transcription id of the table, rewrite <trsDirectory>/<id>.trs into the output directory:
// the turns of its single Section are moved into new Section nodes, one per table section
// overlapping the turns, and "Sujet non synchronisé" sections for the turns matching no table section.
for (String id : sectionGroupsToInsert.keySet()) {
	cpb.tick()
	File trsFile = new File(trsDirectory, id+".trs")
	if (!trsFile.exists()) {
		continue // no transcription file for this id
	}
	//println "Processing $id..."

	// order the sections by start time, then by end time
	def sections = sectionGroupsToInsert[id]
	sections = sections.sort() { a, b -> a[0] <=> b[0] ?: a[1] <=> b[1] }

	// Open input file
	def parser = new groovy.util.XmlParser(false, true, true);
	parser.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false) // allow DTD declaration
	parser.setProperty("http://javax.xml.XMLConstants/property/accessExternalDTD", "all"); // allow to read DTD from local file
	def trs = parser.parse(trsFile.toURI().toString())

	// exactly one Episode containing exactly one Section is expected
	def trsEpisodes = trs.Episode
	if (trsEpisodes.size() != 1) {
		println((trsEpisodes.size() == 0 ? "no" : "multiple") + " Episode node in $trsFile")
		continue
	}
	def trsEpisode = trsEpisodes[0]
	def trsSections = trsEpisode.Section
	if (trsSections.size() != 1) {
		println((trsSections.size() == 0 ? "no" : "multiple") + " Section node in $trsFile")
		continue
	}
	def trsSection = trsSections[0]

	def turns = trsSection.Turn
	def iSection = 0 // index of the last matched section, the next search starts from it
	def currentSection = null // table section of currentNode, null when currentNode is unsynchronized
	def currentNode = null // Section node receiving the turns

	for (def turn : turns) {
		def start = Float.parseFloat(turn.@startTime)
		def end = Float.parseFloat(turn.@endTime)

		// search the first section overlapping the turn
		def found = null
		for (int i = iSection ; i < sections.size() ; i++) {
			boolean turnBeforeSection = end < sections[i][0]
			boolean turnAfterSection = sections[i][1] < start
			if (!turnBeforeSection && !turnAfterSection) {
				found = sections[i]
				iSection = i
				break // stop searching and set iSection to accelerate next search
			}
		}

		if (found == null) {
			// open an unsynchronized section unless one is already open
			if (currentSection != null || currentNode == null) {
				currentNode = new Node(trsEpisode, "Section", ["type":"Sujet non synchronisé", "startTime":turn.@startTime, "endTime":"", "synchronized":"false"] )
				currentSection = null
			}
		} else if (found != currentSection) {
			// the unsynchronized section being left ends with its last turn
			if (currentNode != null && currentNode.@synchronized == "false") {
				def previousTurns = currentNode.Turn
				currentNode.@endTime = previousTurns[-1].@endTime
			}

			currentSection = found
			currentNode = new Node(trsEpisode, "Section", currentSection[2])
		}

		// move the turn from the initial section to the current one
		trsSection.remove(turn)
		currentNode.append(turn)
	}

	// a trailing unsynchronized section also ends with its last turn
	// (its endTime was left empty when no synchronized section followed it)
	if (currentNode != null && currentNode.@synchronized == "false") {
		def lastTurns = currentNode.Turn
		if (lastTurns.size() > 0) {
			currentNode.@endTime = lastTurns[-1].@endTime
		}
	}

	//remove the initial section
	trsEpisode.remove(trsSection)

	outputDirectory.mkdir()
	File outfile = new File(outputDirectory, trsFile.getName())
	outfile.withWriter("UTF-8") { writer ->
		writer.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
		def printer = new groovy.util.XmlNodePrinter(new PrintWriter(writer))
		printer.setPreserveWhitespace(true)
		printer.print(trs)
	}
}
243
cpb.done()
244
reader.close()
245
println "Done."
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/antract/PrepareAFVOIXOFFCorpusMacro.groovy (revision 2942)
1
package org.txm.macro.projects.antract
2

  
3
import groovy.transform.Field
4
import org.txm.rcp.swt.widget.parameters.*
5

  
6
import org.txm.macro.transcription.*
7
import org.txm.utils.io.FileCopy
8
import org.txm.macro.projects.antract.BuildAFMetadataMacro
9
import org.txm.macro.projects.antract.BuildAFMetadataMacro
10

  
11
@Field @Option(name="tableFile", usage="Tableau des metadonnées de sections", widget="FileOpen", required=true, def="all.xlsx") // /home/mdecorde/TEMP/ANTRACT/AF/all.xlsx
12
File tableFile;
13

  
14
@Field @Option(name="trsDirectory", usage="Dossier qui contient les fichiers TRS à corriger", widget="Folder", required=true, def="AF")
15
def trsDirectory
16

  
17
if (!ParametersDialog.open(this)) return;
18

  
19
// The intermediate tables are looked up next to the main table file
// (absolute form, so that a table given without directory still has a parent).
File workingDirectory = tableFile.getAbsoluteFile().getParentFile()

// extract infos for sujets and emissions from the main table file
gse.runMacro(BuildAFMetadataMacro, ["tableFile":tableFile,
				"buildSujetsMetadata": true,
				"buildEmissionsMetadata": true])

File emissionsFile = new File(workingDirectory, "emissions.xlsx")
File sujetsFile = new File(workingDirectory, "sujets.xlsx")

// the next steps need both tables: stop with a message instead of failing later on a missing file
for (File expected : [emissionsFile, sujetsFile]) {
	if (!expected.exists()) {
		println "ERROR: $expected not found, the metadata extraction failed"
		return
	}
}

// fix TRS files in the trsDirectory directory
gse.runMacro(FixINATRSMacro, ["trsDirectory":trsDirectory])

// insert the section in the TRS files
gse.runMacro(AddSectionsFromTableMacro, ["metadataFile": sujetsFile
	, "trsDirectory": trsDirectory
	, "joinTRSColumn": "Lien notice principale"
	, "startTimeColumn": "antract_debut"
	, "endTimeColumn": "antract_fin"
	, "typeColumns": "Titre propre"
	, "topicColumns": "Date de diffusion"
	, "metadataColumns": "Titre propre;Date de diffusion;Identifiant de la notice;Notes du titre;Type de date;Durée;Genre;Langue VO / VE;Nature de production;Producteurs (Aff.);Thématique;Nom fichier segmenté (info);antract_video;antract_debut;antract_fin;antract_duree;antract_tc_type;antract_tc_date;Résumé;Séquences;Descripteurs (Aff. Lig.);Générique (Aff. Lig.)"
	, "metadataColumnsGroups": "metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;metadata;secondary;secondary;secondary;secondary;secondary;secondary;secondary;text;text;text;text"])

// AddSectionsFromTableMacro writes its results in the "out" sub-directory of trsDirectory,
// and only creates it when at least one TRS file is written
File outputDirectory = new File(trsDirectory, "out")
if (!outputDirectory.exists()) {
	println "ERROR: $outputDirectory not found, no TRS file was written"
	return
}

//copy the emissions file in the source directory
FileCopy.copy(emissionsFile, new File(outputDirectory, "metadata.xlsx"))

//done \o/
println "Done: import to finalize using the "+outputDirectory+" directory."
48

  
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/antract/BuildAFMetadataMacro.groovy (revision 2942)
1
package org.txm.macro.projects.antract
2

  
3
import java.io.File
4
import java.nio.charset.Charset
5

  
6
import java.time.LocalTime
7
import java.time.format.DateTimeFormatter
8
import java.util.Arrays
9
import java.util.HashMap
10
import java.util.List
11

  
12
import org.txm.libs.msoffice.ReadExcel
13
import org.txm.utils.*
14

  
15
@Field @Option(name="tableFile", usage="Tableau des metadonnées de sections", widget="FileOpen", required=true, def="all.xlsx") // /home/mdecorde/TEMP/ANTRACT/AF/all.xlsx
16
File tableFile;
17

  
18
@Field @Option(name="buildSujetsMetadata", usage="Build the sujets metadata", widget="Boolean", required=true, def="true")
19
boolean buildSujetsMetadata;
20

  
21
@Field @Option(name="buildEmissionsMetadata", usage="Build the emissions metadata", widget="Boolean", required=true, def="true")
22
boolean buildEmissionsMetadata;
23

  
24
if (!ParametersDialog.open(this)) return;
25

  
26
File table2File = null;

System.out.println("opening $tableFile...");
ReadExcel excel = new ReadExcel(tableFile, null);

// The result tables are written next to the main table
// (absolute form: a table given without directory has a null parent, which makes new File(null, name) ambiguous in Groovy).
File resultDirectory = tableFile.getAbsoluteFile().getParentFile()

// The source workbook is shared by the two extractions below: it is closed once, here,
// and no longer by process() (which closed it before the second extraction could use it).
try {
	if (buildEmissionsMetadata) {
		//emissions
		table2File = new File(resultDirectory, "emissions.xlsx");
		HashMap<String, String> lineRules = new HashMap<>(); // line tests to select line to keep
		List<String> columnsSelection; // list of columns to keep
		HashMap<String, String> columnsToCopy = new HashMap<>();
		HashMap<String, String> columnsToRenameRules = new HashMap<>();
		HashMap<String, String[]> searchAndReplaceRules = new HashMap<>();

		columnsSelection = Arrays.asList(
		"Identifiant de la notice", "Titre propre", "Notes du titre", "Date de diffusion", "Durée", "Nom fichier segmenté (info)", "antract_video",
		"antract_debut","antract_fin","antract_duree","antract_tc_type","antract_tc_date");

		lineRules.put("Type de notice", "Notice sommaire");

		columnsToRenameRules.put("Identifiant de la notice", "id");

		columnsToCopy.put("Notes du titre", "subtitle");
		columnsToCopy.put("Titre propre", "title");
		// NOTE(review): the two entries below share the key "Date de diffusion"; the second put()
		// replaces the first one, so only the "annee" copy is kept in the map. Copying one column
		// twice needs another structure than a HashMap -- to fix together with ReadExcel.copyColumns().
		columnsToCopy.put("Date de diffusion", "textorder");
		columnsToCopy.put("Date de diffusion", "annee");

		searchAndReplaceRules.put("textorder", ["([0-9][0-9])/([0-9][0-9])/([0-9][0-9][0-9][0-9])", '$3$2$1'] as String[]); // not working yet
		searchAndReplaceRules.put("annee", ["([0-9][0-9])/([0-9][0-9])/([0-9][0-9][0-9][0-9])", '$3'] as String[]); // not working yet

		process(excel, table2File, lineRules, columnsSelection, columnsToCopy, searchAndReplaceRules, columnsToRenameRules)
	}

	if (buildSujetsMetadata) {
		// sujets
		table2File = new File(resultDirectory, "sujets.xlsx");
		HashMap<String, String> lineRules = new HashMap<>(); // line tests to select line to keep
		List<String> columnsSelection; // list of columns to keep
		HashMap<String, String> columnsToCopy = new HashMap<>();
		HashMap<String, String> columnsToRenameRules = new HashMap<>();
		HashMap<String, String[]> searchAndReplaceRules = new HashMap<>();

		columnsSelection = Arrays.asList(
			"Identifiant de la notice", "Titre propre", "Notes du titre", "Lien notice principale",
			"Date de diffusion", "Type de date", "Durée", "Genre", "Langue VO / VE", "Nature de production", "Producteurs (Aff.)", "Thématique",
			"Nom fichier segmenté (info)", "antract_video", "antract_debut", "antract_fin", "antract_duree", "antract_tc_type", "antract_tc_date",
			"Résumé", "Séquences", "Descripteurs (Aff. Lig.)", "Générique (Aff. Lig.)");

		lineRules.put("Type de notice", "Notice sujet");

		process(excel, table2File, lineRules, columnsSelection, columnsToCopy, searchAndReplaceRules, columnsToRenameRules)
	}
} finally {
	excel.close();
}

/**
 * Extracts a selection of lines and columns of a table to a new table file, then applies
 * the column copies, the search&replace rules and the column renamings, in that order.
 * An existing result file is deleted first. The source table is left open: closing it is
 * the responsibility of the caller.
 *
 * @param excel the source table
 * @param table2File the result file
 * @param lineRules column name -> value tests selecting the lines to keep
 * @param columnsSelection names of the columns to keep
 * @param columnsToCopy columns to copy, skipped when empty
 * @param searchAndReplaceRules column name -> [pattern, replacement], skipped when empty
 * @param columnsToRenameRules column name -> new name, skipped when empty
 * @return true when the result file exists after saving, false when the extraction failed
 */
def process(ReadExcel excel, File table2File, def lineRules, def columnsSelection, def columnsToCopy, def searchAndReplaceRules, def columnsToRenameRules) {
	System.out.println("Writing: $table2File");

	table2File.delete();
	ReadExcel excel2 = new ReadExcel(table2File, null);
	try {
		println " Selecting $columnsSelection with lines matching $lineRules"
		if (!excel.extractTo(excel2, lineRules, columnsSelection)) {
			System.out.println("FAIL");
			return false;
		}

		if (columnsToCopy.size() > 0) {
			System.out.println(" Copying column: " + columnsToCopy);
			excel2.copyColumns(columnsToCopy);
			println " WARNING: ReadExcel.copyColumns() not implemented"
		}

		if (searchAndReplaceRules.size() > 0) {
			System.out.println(" Search&replace column: " + searchAndReplaceRules);
			excel2.searchAndReplaceInLines(searchAndReplaceRules);
		}

		if (columnsToRenameRules.size() > 0) {
			System.out.println(" Renaming column: " + columnsToRenameRules);
			excel2.renameColumns(columnsToRenameRules);
		}

		System.out.println(" Saving&Closing...");
		excel2.save();
	} finally {
		// also reached when the extraction fails, so that the result table is never left open
		excel2.close();
	}
	System.out.println("Done: $table2File");

	return table2File.exists()
}
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/projects/antract/FixINATRSMacro.groovy (revision 2942)
1
package org.txm.macro.projects.antract
2

  
3
import org.txm.utils.ConsoleProgressBar
4
import org.kohsuke.args4j.*
5
import groovy.transform.Field
6
import org.txm.rcp.swt.widget.parameters.*
7

  
8
@Field @Option(name="trsDirectory", usage="Dossier qui contient les fichiers TRS à corriger", widget="Folder", required=true, def="AF")
9
def trsDirectory
10

  
11
if (!ParametersDialog.open(this)) return;
12

  
13
// Fix, in place, known defects of the ".trs" files found directly in trsDirectory.
println "Fixing $trsDirectory"
def files = trsDirectory.listFiles()
if (files == null) {
	// listFiles() returns null when the path is not a directory or cannot be read
	println "ERROR: $trsDirectory is not a readable directory"
	return
}
ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
for (File trsFile : files) {
//File trsFile = new File(directory, "AFE86004868.trs")
	cpb.tick()
	if (!trsFile.getName().endsWith(".trs")) {
		continue;
	}
	String content = trsFile.getText("UTF-8")
	// drop the extra double quote closing a punct attribute: punct="x""> becomes punct="x">
	content = content.replaceAll("punct=\"([^\"]+)\"\">", "punct=\"\$1\">")
	// the two fixes below are literal substitutions, no regular expression is needed
	content = content.replace("<unk>", "???")
	content = content.replace(" Time=\"", " time=\"")
	trsFile.setText(content, "UTF-8")
}
cpb.done()
println "Done"

Formats disponibles : Unified diff