Révision 2259
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/csv/Table2CorpusMacro.groovy (revision 2259) | ||
---|---|---|
1 |
package org.txm.macro.csv |
|
2 |
|
|
3 |
import org.kohsuke.args4j.* |
|
4 |
import groovy.transform.Field |
|
5 |
import java.nio.charset.Charset |
|
6 |
import org.txm.rcp.swt.widget.parameters.* |
|
7 |
import org.txm.utils.* |
|
8 |
import javax.xml.stream.* |
|
9 |
import java.net.URL |
|
10 |
|
|
11 |
// Macro parameters. Each @Field/@Option pair below is filled in by the parameters dialog
// (ParametersDialog.open(this)) before the rest of the script runs.
// The list parameters are entered as comma-separated strings and split later in the script.

// Table to convert.
@Field @Option(name="inputFile", usage="CSV File", widget="File", required=false, def="file.xlsx")
File inputFile;

// Directory receiving one "<text id>.xml" file per text.
@Field @Option(name="outputDirectory", usage="output directory", widget="File", required=false, def="directory")
File outputDirectory;

// "column=regexp": a row is a text row when the column value matches the regexp.
@Field @Option(name="textSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sommaire")
def textSelector;

// "column=regexp": a row is a structure row when the column value matches the regexp.
@Field @Option(name="structureSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sujet")
def structureSelector;

@Field @Option(name="textIDColumn", usage="text id column", widget="String", required=false, def="Identifiant de la notice")
def textIDColumn;

@Field @Option(name="joinColumn", usage="jointure column, values should point to the textIDColumn values", widget="String", required=false, def="Lien notice principale")
def joinColumn;

@Field @Option(name="textMetadataColumnList", usage="text metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
def textMetadataColumnList;

@Field @Option(name="textContentColumnList", usage="text content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
def textContentColumnList;

// Name of the XML element opened for each structure row (usage text fixed: it used to read
// "structure metadata columns").
@Field @Option(name="structureTag", usage="structure tag name", widget="String", required=false, def="div")
def structureTag;

@Field @Option(name="structureMetadataColumnList", usage="structure metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
def structureMetadataColumnList;

@Field @Option(name="structureContentColumnList", usage="structure content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
def structureContentColumnList;

// Metadata columns written through writeMetadataDate() instead of as a plain attribute.
@Field @Option(name="metadataDateColumnList", usage="metadata columns of type=Date", widget="String", required=false, def="")
def metadataDateColumnList;

// Content columns written through writeContentPrefix() (usage text fixed: it was a copy of
// the metadataDateColumnList one).
@Field @Option(name="prefixContentColumnList", usage="content columns made of prefixed entries (XXX: value ;)", widget="String", required=false, def="")
def prefixContentColumnList;

// Content columns written through writeContentList() (usage text fixed: it was a copy of
// the metadataDateColumnList one).
@Field @Option(name="listContentColumnList", usage="content columns holding one list item per line", widget="String", required=false, def="")
def listContentColumnList;
|
50 |
|
|
51 |
|
|
52 |
//@Field @Option(name="structureOrderColumn", usage="structure column coding structure order", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences") |
|
53 |
// def structureOrderColumn; |
|
54 |
|
|
55 |
// Ask the user for the parameter values; stop when the dialog is not validated.
if (!ParametersDialog.open(this)) return;

// The column lists are entered as comma-separated strings.
textMetadataColumnList = textMetadataColumnList.split(",")
textContentColumnList = textContentColumnList.split(",")
structureMetadataColumnList = structureMetadataColumnList.split(",")
structureContentColumnList = structureContentColumnList.split(",")
metadataDateColumnList = metadataDateColumnList.split(",")
prefixContentColumnList = prefixContentColumnList.split(",")
listContentColumnList = listContentColumnList.split(",")

// The selectors are written "column=regexp" and are turned into [column, regexp] pairs.
// A selector without "=" used to fail with a StringIndexOutOfBoundsException in substring().
int ti = textSelector.indexOf("=")
int si = structureSelector.indexOf("=")
if (ti < 0 || si < 0) {
	println "Error: textSelector and structureSelector must be written column_to_test=regexp"
	return
}
textSelector = [textSelector.substring(0, ti), textSelector.substring(ti+1)]
structureSelector = [structureSelector.substring(0, si), structureSelector.substring(si+1)]


println "textIDColumn, joinColumn=$textIDColumn, $joinColumn"
println "textMetadataColumnList columns: $textMetadataColumnList"
println "textContentColumnList columns: $textContentColumnList"
println "structureMetadataColumnList columns: $structureMetadataColumnList"
println "structureContentColumnList columns: $structureContentColumnList"

println "text selector="+textSelector
println "structure selector="+structureSelector
println "structureTag="+structureTag

TableReader reader = new TableReader(inputFile);
if (!reader.readHeaders()) {
	println "Error: no header"
	return
}
def headers = Arrays.asList(reader.getHeaders())
println "table columns: $headers"

// Every column named in the parameters must exist in the table. All the missing columns are
// reported before stopping. The two selector columns are now checked as well.
def ok = true
for (def list : ["selection":[textIDColumn, joinColumn], "selector":[textSelector[0], structureSelector[0]],
	"textMetadataColumnList":textMetadataColumnList,
	"textContentColumnList":textContentColumnList, "structureMetadataColumnList":structureMetadataColumnList,
	"structureContentColumnList":structureContentColumnList]) {
	for (String m : list.value) {
		m = m.trim()
		if (!headers.contains(m)) {
			println "Error: missing ${list.key} column: $m"
			ok = false
		}
	}
}
if (!ok) {
	reader.close() // the headers were read, so the reader is open and nothing else will use it
	return;
}
|
102 |
|
|
103 |
// group by text
// texts maps a text id to its records. The text record is inserted at index 0 and the structure
// records that point to this id through joinColumn are appended after it.
def texts = new LinkedHashMap()
def nRecord = 0
// Reads a cell of the current record, trimmed; a missing (null) cell is read as "".
// The original code called trim() before its null checks, so a null cell was an NPE.
def cell = { String column -> (reader.get(column) ?: "").trim() }
while (reader.readRecord()) {
	nRecord++
	//println "record="+reader.getRecord().get(textSelector[0])+" "+reader.getRecord().get(structureSelector[0])

	String id = cell(textIDColumn)
	String join = cell(joinColumn)
	if (cell(textSelector[0]).matches(textSelector[1])) {
		if (!texts.containsKey(id)) texts[id] = []
		texts[id].add(0, reader.getRecord())
	} else if (cell(structureSelector[0]).matches(structureSelector[1])) {
		if (!texts.containsKey(join)) texts[join] = []
		texts[join].add(reader.getRecord())
	}
	// any other record is ignored
}
println "N lines: "+nRecord
println "N groups: "+texts.size()
if (texts.size() == 0) {
	println "No text found. Aborting."
	return
}
// mkdirs() also creates the missing parent directories, which mkdir() does not.
outputDirectory.mkdirs()
|
131 |
|
|
132 |
// Writers shared by the text element and its structures. They are closures (not script methods)
// so that they can see the local variables of the script.

// Writes each metadata column as an attribute of the element that was just opened.
def writeMetadataAttributes = { xml, row, columns ->
	for (String att : columns) {
		if (att in metadataDateColumnList) {
			writeMetadataDate(row, att, xml)
		} else {
			xml.writeAttribute(AsciiUtils.buildAttributeId(att), row.get(att))
		}
	}
}

// Writes a <metadata><list type="unordered"> element with one <item> per metadata column.
def writeMetadataList = { xml, row, columns ->
	xml.writeStartElement("metadata")
	xml.writeStartElement("list")
	xml.writeAttribute("type", "unordered")
	xml.writeCharacters("\n")
	for (String att : columns) {
		xml.writeStartElement("item")
		xml.writeCharacters(att+" : "+row.get(att).replaceAll("\n", ";"))
		xml.writeEndElement() // item
		xml.writeCharacters("\n")
	}
	xml.writeEndElement() // list
	xml.writeEndElement() // metadata
	xml.writeCharacters("\n")
}

// Writes the content columns of a row: prefixed lists, line lists or plain <p type="column">.
// paragraphId, when not null, is written as the "id" attribute of the plain paragraphs.
def writeContents = { xml, row, columns, paragraphId ->
	for (String att : columns) {
		if (att in prefixContentColumnList) {
			writeContentPrefix(row, att, xml)
		} else if (att in listContentColumnList) {
			writeContentList(row, att, xml)
		} else {
			xml.writeStartElement("p")
			xml.writeAttribute("type", att.trim())
			if (paragraphId != null) xml.writeAttribute("id", paragraphId)
			xml.writeCharacters("\n") // simple XML formating

			xml.writeStartElement("head")
			xml.writeStartElement("hi")
			xml.writeCharacters(att+" : ")
			xml.writeEndElement() // hi
			xml.writeEndElement() // head

			xml.writeCharacters(row.get(att)) // the column content
			xml.writeEndElement() // p
			xml.writeCharacters("\n") // simple XML formating
		}
	}
}

// One "<id>.xml" file per group: a <text> element for the first record of the group, then a
// <pb/> and a structureTag element for each of the following records.
for (def id : texts.keySet()) {
	def toWrite = texts[id]
	def text = toWrite[0]
	String textSelectorValue = text.get(textSelector[0])?.trim()
	if (textSelectorValue == null || !textSelectorValue.matches(textSelector[1])) {
		// the group only contains structure records: their joinColumn points to no text record
		println "ERROR: '$id' text group with no text line"
		continue
	}
	println "Processing text: $id"

	File outputfile = new File(outputDirectory, id+".xml")
	FileOutputStream output = new FileOutputStream(outputfile)
	XMLStreamWriter writer = null
	try {
		writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8")

		writer.writeStartDocument("UTF-8","1.0")
		writer.writeCharacters("\n") // simple XML formating
		writer.writeStartElement("TEI")
		writer.writeCharacters("\n") // simple XML formating
		writer.writeStartElement("teiHeader")
		writer.writeEndElement() // teiHeader
		writer.writeCharacters("\n") // simple XML formating
		writer.writeStartElement("text")
		writer.writeAttribute("id", id)
		writeMetadataAttributes(writer, text, textMetadataColumnList)
		writer.writeCharacters("\n") // simple XML formating

		writeMetadataList(writer, text, textMetadataColumnList)
		writeContents(writer, text, textContentColumnList, null)

		for (int i = 1 ; i < toWrite.size() ; i++) {
			def record = toWrite[i]

			writer.writeEmptyElement("pb") // <pb/>
			writer.writeAttribute("n", ""+i) // structures are numbered from 1

			writer.writeStartElement(structureTag)
			writeMetadataAttributes(writer, record, structureMetadataColumnList)
			writer.writeCharacters("\n")

			writeMetadataList(writer, record, structureMetadataColumnList)
			// FIX: the prefix and list columns were read from the text record instead of the
			// structure record, so every structure repeated the content of its text.
			writeContents(writer, record, structureContentColumnList, record.get(textIDColumn))

			writer.writeEndElement() // struct
			writer.writeCharacters("\n") // simple XML formating
		}

		writer.writeEndElement() // text
		writer.writeCharacters("\n") // simple XML formating
		writer.writeEndElement() // TEI
	} finally {
		// close the XML writer then the file, also when a record could not be written
		try {
			if (writer != null) writer.close()
		} finally {
			output.close()
		}
	}
}
// All the records were read before this loop; the reader is closed once, after the last file
// (it used to be closed inside the loop, once per written text).
reader.close()
|
270 |
|
|
271 |
/**
 * Writes the value of a content column as a sequence of paragraphs.
 *
 * When the value contains line breaks, each non-empty line is written as a
 * <p rend="list"> element. Otherwise (or when all the lines are empty) the whole value is
 * written in a single <p rend="no-list"> element.
 *
 * FIX: lines were found with a regexp that required a line break after each line, so the last
 * line of a value not ending with a line break was silently dropped.
 *
 * @param record the current row; only record.get(att) is called
 * @param att the name of the column to write
 * @param writer the XML writer; writeCharacters, writeStartElement, writeAttribute and
 *        writeEndElement are called
 */
def writeContentList(def record, def att, def writer) {
	writer.writeCharacters("\n")
	// a missing (null) cell is written as an empty one
	String value = record.get(att) ?: ""
	// local variable: "found"/"matches" used to be undeclared and leaked into the script binding
	def lines = value.contains("\n") ? value.split("\n").findAll { it.length() > 0 } : []
	for (String line : lines) {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "list")
		writer.writeCharacters(line)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
	}
	if (lines.size() == 0) {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "no-list")
		writer.writeCharacters(value)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
	}
}
|
294 |
|
|
295 |
/**
 * Writes the value of a content column as a <list rend="prefixes" type="unordered"> element.
 *
 * The value is searched for entries made of a 3 uppercase letters prefix, an optional ":", the
 * entry text, then at least one space followed by ";" or a line break. Each entry becomes an
 * <item type="PREFIX"> holding a <span> with the prefix, followed by the entry text. When no
 * entry is found the raw value is written inside the list element.
 *
 * @param record the current row; only record.get(att) is called
 * @param att the name of the column to write
 * @param writer the XML writer; writeCharacters, writeStartElement, writeAttribute and
 *        writeEndElement are called
 */
def writeContentPrefix(def record, def att, def writer) {
	writer.writeCharacters("\n")
	writer.writeStartElement("list")
	writer.writeAttribute("rend", "prefixes")
	writer.writeAttribute("type", "unordered")
	writer.writeCharacters("\n")
	// a missing (null) cell is written as an empty one
	String value = record.get(att) ?: ""
	// One pass over the value. The entries used to be found with a first regexp then matched
	// again with this one, through "found" and "matches" variables that were not declared and
	// therefore leaked into the script binding.
	def entries = value =~ /(?s)([A-Z]{3}):? *([^;\n]+?) +[;\n]/
	boolean entryFound = false
	while (entries.find()) {
		entryFound = true
		writer.writeCharacters("\t")
		writer.writeStartElement("item")
		writer.writeAttribute("type", entries.group(1))
		writer.writeStartElement("span")
		writer.writeCharacters(entries.group(1)+" ")
		writer.writeEndElement() // span
		writer.writeCharacters(entries.group(2))
		writer.writeEndElement() // item
		writer.writeCharacters("\n")
	}
	if (!entryFound) {
		writer.writeCharacters(value)
	}
	writer.writeEndElement() // list
	writer.writeCharacters("\n")
}
|
322 |
|
|
323 |
/**
 * Writes a date column as five attributes of the current element. Their names are the
 * normalized column name followed by "jour" (day), "joursemaine" (day name), "mois" (month),
 * "annee" (year) and "tri" (year-month-day, usable as a sort key).
 *
 * The first dd/MM/yyyy date found in the cell is used. The day name is formatted with the
 * default locale of the JVM.
 *
 * FIX: a cell holding no dd/MM/yyyy date (an empty cell for instance) used to stop the macro
 * with an exception on matches[0]. The five attributes are now written with an empty value and
 * a warning is printed.
 * NOTE(review): empty values were chosen so that all the elements keep the same attributes;
 * confirm this is what the import expects.
 *
 * @param record the current row; only record.get(att) is called
 * @param att the name of the date column
 * @param writer the XML writer; only writeAttribute is called
 */
def writeMetadataDate(def record, def att, def writer) {
	String value = record.get(att)
	String att_normalized = AsciiUtils.buildAttributeId(att)
	// local variable: "matches" used to be undeclared and leaked into the script binding
	def date = (value ?: "") =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/
	if (!date.find()) {
		println "Warning: no dd/MM/yyyy date in column '$att': $value"
		for (String suffix : ["jour", "joursemaine", "mois", "annee", "tri"]) {
			writer.writeAttribute(att_normalized+suffix, "")
		}
		return
	}
	String day = date.group(1)
	String month = date.group(2)
	String year = date.group(3)
	// parse the matched date only, so that text around the date does not matter
	def parsed = new java.text.SimpleDateFormat("dd/MM/yyyy").parse(day+"/"+month+"/"+year)

	writer.writeAttribute(att_normalized+"jour", day)
	writer.writeAttribute(att_normalized+"joursemaine", new java.text.SimpleDateFormat('EEEE').format(parsed))
	writer.writeAttribute(att_normalized+"mois", month)
	writer.writeAttribute(att_normalized+"annee", year)
	writer.writeAttribute(att_normalized+"tri", year+"-"+month+"-"+day)
}
|
333 |
|
|
334 |
/* |
|
335 |
String name = inputFile.getName() |
|
336 |
int idx = name.lastIndexOf(".") |
|
337 |
if (idx > 0) name = name.substring(0, idx) |
|
338 |
*/ |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/pager.groovy (revision 2259) | ||
---|---|---|
351 | 351 |
writer.writeAttribute("class", "section"); |
352 | 352 |
String type = parser.getAttributeValue(null,"type"); |
353 | 353 |
String desc = parser.getAttributeValue(null,"topic"); |
354 |
if (type != null && desc != null) { |
|
354 |
String metadata = parser.getAttributeValue(null,"metadata"); |
|
355 |
if (type != null || desc != null) { |
|
355 | 356 |
writer.writeStartElement("h3"); |
356 |
writer.writeCharacters(type+": "+desc); |
|
357 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) { |
|
358 |
String name = parser.getAttributeLocalName(i); |
|
359 |
if (!"type".equals(name) |
|
360 |
&& !"topic".equals(name) |
|
361 |
&& !"startTime".equals(name) |
|
362 |
&& !"endTime".equals(name)) { |
|
363 |
writer.writeCharacters(", "+name+": "+parser.getAttributeValue(i)); |
|
357 |
// Heading text: "type: topic" when both are set, otherwise the one that is set.
// FIX: the test was `type != null || type.length() ==0`, which calls length() on a null type
// and wrote "type: null" when only the type was set.
boolean hasType = type != null && type.length() > 0;
boolean hasDesc = desc != null && desc.length() > 0;
if (hasType && hasDesc) {
	writer.writeCharacters(type+": "+desc);
} else if (hasType) {
	writer.writeCharacters(type);
} else if (hasDesc) {
	writer.writeCharacters(desc);
}
|
362 |
writer.writeEndElement(); // h3 |
|
363 |
|
|
364 |
if (metadata != null && metadata.length() > 0) { // the metadata to show |
|
365 |
writer.writeStartElement("ul"); |
|
366 |
for (def m : metadata.split("\t")) { |
|
367 |
writer.writeStartElement("li"); |
|
368 |
writer.writeCharacters(m); |
|
369 |
writer.writeEndElement(); // li |
|
364 | 370 |
} |
371 |
writer.writeEndElement(); // ul |
|
372 |
} else if (parser.getAttributeCount() > 1) { // process all attributes |
|
373 |
writer.writeStartElement("ul"); |
|
374 |
for (int i = 0 ; i < parser.getAttributeCount() ; i++) { |
|
375 |
String name = parser.getAttributeLocalName(i); |
|
376 |
if (!"type".equals(name) |
|
377 |
&& !"topic".equals(name) |
|
378 |
&& !"startTime".equals(name) |
|
379 |
&& !"endTime".equals(name)) { |
|
380 |
writer.writeStartElement("li"); |
|
381 |
writer.writeCharacters(""+name+": "+parser.getAttributeValue(i)); |
|
382 |
writer.writeEndElement(); // li |
|
383 |
} |
|
384 |
} |
|
385 |
writer.writeEndElement(); // ul |
|
365 | 386 |
} |
366 |
writer.writeEndElement(); // h3 |
|
367 | 387 |
} |
368 | 388 |
break; |
369 | 389 |
case "sp": |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/FixINAXML.groovy (revision 2259) | ||
---|---|---|
1 |
import org.txm.utils.ConsoleProgressBar |
|
2 |
|
|
3 |
// Rewrites in place every ".trs" file of the directory with three textual fixes:
// a doubled quote after a punct="..." attribute, the <unk> marker and the "Time" attribute name.
def directory = new File("/home/mdecorde/TEMP/ANTRACT/AF/trs")
println "Fixing $directory"
def files = directory.listFiles()
if (files == null) {
	// listFiles() returns null when the path is not a directory or cannot be read;
	// files.size() would then fail with a NullPointerException
	println "$directory is not a readable directory"
	return
}
ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
for (File trsFile : files) {
	//File trsFile = new File(directory, "AFE86004868.trs")
	cpb.tick() // one tick per file, including the files that are skipped
	if (!trsFile.getName().endsWith(".trs")) {
		continue;
	}
	String content = trsFile.getText("UTF-8")
	content = content.replaceAll("punct=\"([^\"]+)\"\">", "punct=\"\$1\">") // punct="x""> -> punct="x">
	content = content.replaceAll("<unk>", "???")
	content = content.replaceAll(" Time=\"", " time=\"")
	trsFile.setText(content, "UTF-8")
}
cpb.done()
println "Done"
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/AddSections.groovy (revision 2259) | ||
---|---|---|
1 |
import java.nio.charset.Charset |
|
2 |
|
|
3 |
import org.txm.utils.AsciiUtils |
|
4 |
import org.txm.utils.ConsoleProgressBar |
|
5 |
import org.txm.utils.CsvReader |
|
6 |
import java.time.LocalTime |
|
7 |
import java.time.format.DateTimeFormatter |
|
8 |
|
|
9 |
// Script settings: the metadata table, the directory of the TRS files and the table columns
// used to build the sections.
File metadataFile = new File("/home/mdecorde/TEMP/ANTRACT/AF/metadata.tsv")
File trsDirectory = new File("/home/mdecorde/TEMP/ANTRACT/AF/trs")
def idTRSColumn = "Lien notice principale" // its value (minus ".mp4") is the name of the TRS file
def typeColumns = ["Identifiant de la notice"]
def topicColumns = ["Titre propre", "Genre", "antract_debut", "antract_fin"]
def startTimeColumn = "antract_debut"
def endTimeColumn = "antract_fin"
//def metadataColumns = ["Identifiant de la notice", "Titre propre", "antract_debut", "antract_fin"]
def metadataColumns = ["Date de diffusion", "Descripteurs (Aff. Col.)", "Durée", "Générique (Aff. Col.)", "Genre", "Identifiant de la notice", "Langue VO / VE ", "Lien notice principale", "Nature de production ", "Nom fichier segmenté (info)", "Notes du titre ", "Producteurs (Aff.)", "Résumé", "Séquences", "Thématique", "Titre propre", "Type de date", "Type de notice"]

if (!trsDirectory.exists()) {
	println "$trsDirectory not found"
	return
}


// The metadata table is a tabulation-separated UTF-8 file.
CsvReader reader = new CsvReader(metadataFile.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
reader.readHeaders()
def header = reader.getHeaders()
if (!header.contains(idTRSColumn)) {
	println "No TRS ID $idTRSColumn column found"
	return
}
if (!header.contains(startTimeColumn)) {
	println "No start time $startTimeColumn column found"
	return
}
if (!header.contains(endTimeColumn)) {
	println "No end time $endTimeColumn column found"
	return
}
// FIX: the three loops below tested endTimeColumn (already checked above) instead of the
// current column, so a missing metadata, type or topic column was never reported.
for (def col : metadataColumns) {
	if (!header.contains(col)) {
		println "No $col column found"
		return
	}
}
for (def col : typeColumns) {
	if (!header.contains(col)) {
		println "No type $col column found"
		return
	}
}
for (def col : topicColumns) {
	if (!header.contains(col)) {
		println "No topic $col column found"
		return
	}
}

File outputDirectory = new File(trsDirectory, "out")
println "Writing result to $outputDirectory..."
|
61 |
|
|
62 |
// Not declared with "def" on purpose: a script binding variable is visible from strTotime().
dateTimeFormatter = DateTimeFormatter.ISO_LOCAL_TIME

/**
 * Converts a timecode to a number of seconds.
 *
 * The text after the last ":" is read as an integer and divided by 25 (presumably a frame
 * count at 25 frames per second - confirm); a value above 25 is counted as 0. The text before
 * the last ":" is parsed as an ISO local time (HH:mm:ss).
 *
 * @param str the timecode, "HH:mm:ss:ff" for instance
 * @return the hours, minutes and seconds in seconds plus the last field / 25, or null when
 *         str is null or contains no ":"
 */
def strTotime(def str) {
	if (str == null) {
		return null
	}
	int lastColon = str.lastIndexOf(":")
	if (lastColon == -1) {
		return null
	}

	// Typed local variables: "bonusFrame" and "totalFrame" used to be undeclared, which leaked
	// them into the script binding, and totalFrame held a String then a number.
	int frames = Integer.parseInt(str.substring(lastColon+1))
	if (frames > 25) {
		frames = 0;
	}

	LocalTime time = LocalTime.parse(str.substring(0, lastColon), dateTimeFormatter)
	int seconds = (time.getHour()*60*60) + (time.getMinute()*60) + time.getSecond()

	// Groovy divides integers without truncating: 12/25 is 0.48
	return seconds + (frames/25)
}
|
81 |
|
|
82 |
// Read the table: sectionGroupsToInsert maps a TRS file name (without extension) to the list of
// its timed sections, each one stored as [startTime, endTime, attributes of the Section node].
def sectionGroupsToInsert = [:]
while (reader.readRecord()) {
	String id = reader.get(idTRSColumn).trim()
	if (id.endsWith(".mp4")) id = id.substring(0, id.length()-4)
	if (id.length() == 0) continue;

	if (!sectionGroupsToInsert.containsKey(id)) {
		sectionGroupsToInsert[id] = []
	}
	def section = sectionGroupsToInsert[id]

	String startTimeValue = reader.get(startTimeColumn)
	if (startTimeValue == null || startTimeValue.length() == 0) {
		continue // ignore non timed sections
	}

	// The "metadata", "topic" and "type" attributes are lists of "column: value" joined with
	// tabulations; the empty cells are left out.
	def m = [:]
	for (def todo : ["metadata":metadataColumns, "topic":topicColumns, "type":typeColumns]) {
		def data = []
		for (def col : todo.value) {
			String cellValue = reader.get(col).trim()
			if (cellValue.length() > 0) {
				data << col+": "+cellValue.replace("\n", "")
			}
		}
		m[todo.key] = data.join("\t")
	}

	m["startTime"] = strTotime(startTimeValue)
	m["endTime"] = strTotime(reader.get(endTimeColumn))
	m["synchronized"] = "true"

	section << [m["startTime"], m["endTime"], m]
}
// FIX: the reader was never closed; the table is not read again after this loop.
reader.close()

println "N sections: "+sectionGroupsToInsert.size()
|
117 |
|
|
118 |
// For each TRS file with sections in the table: move the Turn nodes of its single Section into
// new Section nodes (the table sections, plus "non synchronisée" sections for the turns that
// overlap no table section) and write the result in outputDirectory.
ConsoleProgressBar cpb = new ConsoleProgressBar(sectionGroupsToInsert.keySet().size())

// Sets the endTime of an unsynchronized Section node to the endTime of its last Turn.
def closeUnsynchronized = { node ->
	if (node != null && node.@synchronized == "false") {
		def nodeTurns = node.Turn
		if (nodeTurns.size() > 0) node.@endTime = nodeTurns[-1].@endTime
	}
}

for (String id : sectionGroupsToInsert.keySet()) {
	cpb.tick()
	File trsFile = new File(trsDirectory, id+".trs")
	if (!trsFile.exists()) {
		continue
	}
	//println "Processing $id..."
	def sections = sectionGroupsToInsert[id]
	sections = sections.sort() { a, b -> a[0] <=> b[0] ?: a[1] <=> b[1] } // by start then end time

	// Open input file
	def slurper = new XmlParser();
	// DOCTYPE declarations are explicitly allowed (this script writes its own output with a
	// <!DOCTYPE Trans ...> line)
	slurper.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false)

	def trs = slurper.parse(trsFile.toURI().toString())
	def trsEpisodes = trs.Episode // 1
	if (trsEpisodes.size() > 1) {
		println "multiple Episode node in $trsFile"
		continue
	}
	def trsEpisode = trsEpisodes[0]
	def trsSections = trs.Episode.Section // 1
	if (trsSections.size() > 1) {
		println "multiple Section node in $trsFile"
		continue
	}
	def trsSection = trsSections[0]
	if (trsEpisode == null || trsSection == null) {
		// FIX: a file without Episode or Section used to fail with a NullPointerException
		println "no Episode or Section node in $trsFile"
		continue
	}

	def turns = trsSection.Turn
	def iSection = 0; // index of the last matched section: the next search starts from it
	def currentSection = null // table section of currentNode, null when it is unsynchronized
	def currentNode = null // Section node receiving the turns

	for (def turn : turns) {
		def start = Float.parseFloat(turn.@startTime)
		def end = Float.parseFloat(turn.@endTime)

		def found = null;
		for (int i = iSection ; i < sections.size() ; i++) {
			if (end < sections[i][0]) { // Turn is before section

			} else if (sections[i][1] < start) { // Turn is after section

			} else {
				found = sections[i]
				iSection = i
				break; // stop searching and set iSection to accelerate next search
			}
		}

		if (found == null) {
			// open an unsynchronized section unless the previous turn is already in one
			if (currentSection != null || currentNode == null) {
				currentNode = new Node(trsEpisode, "Section", ["type":"non synchronisée", "topic":"none", "startTime":turn.@startTime, "endTime":"", "synchronized":"false"] )
				currentSection = null;
			}
		} else {
			if (found != currentSection) {
				closeUnsynchronized(currentNode)

				currentSection = found
				currentNode = new Node(trsEpisode, "Section", currentSection[2])
				//trsEpisode.appendNode(currentNode)
			}
		}
		trsSection.remove(turn)
		currentNode.append(turn)
	}
	// FIX: when the last turns are in an unsynchronized section, its endTime was left empty
	closeUnsynchronized(currentNode)

	//remove the initial section
	trsEpisode.remove(trsSection)

	outputDirectory.mkdir()
	File outfile = new File(outputDirectory, trsFile.getName())
	outfile.withWriter("UTF-8") { writer ->
		writer.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
		def printer = new groovy.util.XmlNodePrinter(new PrintWriter(writer))
		printer.setPreserveWhitespace(true)
		printer.print(trs)
	}
}
cpb.done()
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/importer.groovy (revision 2259) | ||
---|---|---|
54 | 54 |
import org.txm.utils.*; |
55 | 55 |
import org.txm.metadatas.*; |
56 | 56 |
|
57 |
// TODO: Auto-generated Javadoc |
|
58 | 57 |
/** |
59 | 58 |
* The Class importer. |
60 | 59 |
*/ |
... | ... | |
116 | 115 |
|
117 | 116 |
// TRS -> TEI |
118 | 117 |
println "Converting TRS to TEI "+trsfiles.size()+" files" |
118 |
ConsoleProgressBar cpb = new ConsoleProgressBar(trsfiles.size()) |
|
119 | 119 |
for (File infile : trsfiles) { |
120 |
print "."
|
|
120 |
cpb.tick()
|
|
121 | 121 |
String textid = infile.getName() |
122 | 122 |
int idx = textid.indexOf(".trs") |
123 | 123 |
if (idx > 0) textid = textid.substring(0, idx) |
... | ... | |
128 | 128 |
//return false; |
129 | 129 |
} |
130 | 130 |
} |
131 |
println ""
|
|
131 |
cpb.done()
|
|
132 | 132 |
|
133 | 133 |
if (metadatas != null) { |
134 | 134 |
if (metadatas.getHeadersList().size() > 0) { |
... | ... | |
137 | 137 |
|
138 | 138 |
trsfiles = txmDir.listFiles(); |
139 | 139 |
trsfiles.sort() |
140 |
cpb = new ConsoleProgressBar(trsfiles.size()) |
|
140 | 141 |
for (File infile : trsfiles) { |
141 | 142 |
File outfile = new File(txmDir, "tmp.xml") |
142 | 143 |
if (metadatas != null && metadatas.isInitialized()) { |
143 |
print "."
|
|
144 |
cpb.tick()
|
|
144 | 145 |
if (!metadatas.injectMetadatasInXml(infile, outfile, "text")) { |
145 | 146 |
println("Failed to inject metadata in "+infile) |
146 | 147 |
outfile.delete() |
... | ... | |
152 | 153 |
outfile.renameTo(infile) |
153 | 154 |
} |
154 | 155 |
} |
156 |
cpb.done() |
|
155 | 157 |
} |
156 | 158 |
} |
157 | 159 |
|
158 |
println "" |
|
159 | 160 |
|
160 | 161 |
// TOKENIZER ENTITIES |
161 |
println "Tokenizing entities "+txmDir.listFiles().length+" files" |
|
162 |
for (File pfile : txmDir.listFiles()) { |
|
163 |
print "." |
|
162 |
def files = txmDir.listFiles() |
|
163 |
println "Tokenizing entities "+files.length+" files" |
|
164 |
cpb = new ConsoleProgressBar(files.length) |
|
165 |
for (File pfile : files) { |
|
166 |
cpb.tick() |
|
164 | 167 |
TokenizeEntities tokenizer = new TokenizeEntities(pfile.toURI().toURL()); |
165 | 168 |
File outfile = File.createTempFile("tok", ".xml", pfile.getParentFile()); |
166 | 169 |
if (tokenizer.process(outfile)) { |
... | ... | |
168 | 171 |
} |
169 | 172 |
outfile.delete(); |
170 | 173 |
} |
171 |
println ""
|
|
174 |
cpb.done()
|
|
172 | 175 |
|
173 | 176 |
//TOKENIZE |
174 |
println "Tokenizing "+txmDir.listFiles().length+" files from $txmDir"
|
|
177 |
println "Tokenizing "+files.length+" files from $txmDir"
|
|
175 | 178 |
File tokenizedDir = new File(binDir, "tokenized") |
176 | 179 |
tokenizedDir.mkdir() |
177 |
for (File pfile : txmDir.listFiles()) { |
|
178 |
print "." |
|
180 |
cpb = new ConsoleProgressBar(files.length) |
|
181 |
for (File pfile : files) { |
|
182 |
cpb.tick() |
|
179 | 183 |
String filename = pfile.getName().substring(0, pfile.getName().length()-4) |
180 | 184 |
File tfile = new File(tokenizedDir, pfile.getName()) |
181 | 185 |
try { |
... | ... | |
189 | 193 |
return false; |
190 | 194 |
} |
191 | 195 |
} |
192 |
println ""
|
|
196 |
cpb.done()
|
|
193 | 197 |
|
194 | 198 |
//TRANSFORM INTO XML-TEI-TXM |
195 |
println("Building XML-TXM ("+txmDir.listFiles().length+" files)") |
|
196 |
for (File tfile : tokenizedDir.listFiles()) { |
|
197 |
print "." |
|
199 |
files = tokenizedDir.listFiles() |
|
200 |
println("Building XML-TXM ("+files.length+" files)") |
|
201 |
cpb = new ConsoleProgressBar(files.length) |
|
202 |
for (File tfile : files) { |
|
203 |
cpb.tick() |
|
198 | 204 |
String filename = tfile.getName().substring(0, tfile.getName().length()-4) |
199 | 205 |
File xmlfile = new File(txmDir, tfile.getName()) |
200 | 206 |
|
... | ... | |
238 | 244 |
} |
239 | 245 |
} |
240 | 246 |
|
241 |
println "" |
|
247 |
cpb.done() |
|
248 |
|
|
242 | 249 |
return txmDir.listFiles() != null; |
243 | 250 |
} |
244 | 251 |
|
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/compiler.groovy (revision 2259) | ||
---|---|---|
39 | 39 |
import org.txm.importer.cwb.CwbMakeAll |
40 | 40 |
import org.txm.importer.cwb.CwbProcess; |
41 | 41 |
import org.txm.importer.cwb.PatchCwbRegistry; |
42 |
import org.txm.utils.ConsoleProgressBar |
|
42 | 43 |
import org.txm.utils.Pair; |
43 | 44 |
|
44 | 45 |
/** |
... | ... | |
160 | 161 |
output.close(); |
161 | 162 |
|
162 | 163 |
println("Compiling "+xmlfiles.size()+" files") |
164 |
ConsoleProgressBar cpb = new ConsoleProgressBar(xmlfiles.size()) |
|
163 | 165 |
for (File f :xmlfiles) { |
164 |
if (f.exists()) |
|
166 |
if (f.exists()) { |
|
167 |
cpb.tick() |
|
165 | 168 |
if (!process(f)) { |
166 | 169 |
println("Failed to compile "+f) |
167 | 170 |
} |
171 |
} |
|
168 | 172 |
} |
169 |
|
|
173 |
cpb.done() |
|
170 | 174 |
if (!createOutput(cqpFile)) return false; |
171 | 175 |
output.write("</txmcorpus>\n") |
172 | 176 |
output.close(); |
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 2259) | ||
---|---|---|
323 | 323 |
|
324 | 324 |
public boolean doTokenizeStep() { |
325 | 325 |
|
326 |
new File(module.getBinaryDirectory(),"tokenized").deleteDir() |
|
326 | 327 |
new File(module.getBinaryDirectory(),"tokenized").mkdir() |
327 | 328 |
|
328 | 329 |
String outSideTextTagsRegex = ""; |
Formats disponibles : Unified diff