Révision 2259

tmp/org.txm.groovy.core/src/groovy/org/txm/macro/csv/Table2CorpusMacro.groovy (revision 2259)
1
package org.txm.macro.csv
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcp.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10

  
11
// Macro parameters. They are filled in by the ParametersDialog.open(this) call that follows.
// The list parameters are comma-separated column names of the input table.

// Table file to convert
@Field @Option(name="inputFile", usage="CSV File", widget="File", required=false, def="file.xlsx")
		File inputFile;

// Directory receiving one XML file per text
@Field @Option(name="outputDirectory", usage="output directory", widget="File", required=false, def="directory")
		File outputDirectory;

// "column=regexp": rows whose column value matches the regexp are text rows
@Field @Option(name="textSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sommaire")
		def textSelector;

// "column=regexp": rows whose column value matches the regexp are structure rows
@Field @Option(name="structureSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sujet")
		def structureSelector;

// Column giving the identifier of a row; used as text id and output file name
@Field @Option(name="textIDColumn", usage="text id column", widget="String", required=false, def="Identifiant de la notice")
		def textIDColumn;

// Column of a structure row pointing to the identifier of its text row
@Field @Option(name="joinColumn", usage="jointure column, values should point to the textIDColumn values", widget="String", required=false, def="Lien notice principale")
		def joinColumn;

// Columns written as attributes and metadata items of the text element
@Field @Option(name="textMetadataColumnList", usage="text metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
		def textMetadataColumnList;

// Columns written as content of the text element
@Field @Option(name="textContentColumnList", usage="text content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
		def textContentColumnList;

// Name of the XML element created for each structure row
// (usage string fixed: it was a copy of the structureMetadataColumnList one)
@Field @Option(name="structureTag", usage="structure element name", widget="String", required=false, def="div")
		def structureTag;

// Columns written as attributes and metadata items of each structure element
@Field @Option(name="structureMetadataColumnList", usage="structure metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
		def structureMetadataColumnList;

// Columns written as content of each structure element
@Field @Option(name="structureContentColumnList", usage="structure content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
		def structureContentColumnList;

// Metadata columns holding dd/MM/yyyy dates, written by writeMetadataDate
@Field @Option(name="metadataDateColumnList", usage="metadata columns of type=Date", widget="String", required=false, def="")
		def metadataDateColumnList;
// Content columns written by writeContentPrefix
// (usage string fixed: it was a copy of the metadataDateColumnList one)
@Field @Option(name="prefixContentColumnList", usage="content columns written as a list of prefixed items", widget="String", required=false, def="")
		def prefixContentColumnList;
// Content columns written by writeContentList
// (usage string fixed: it was a copy of the metadataDateColumnList one)
@Field @Option(name="listContentColumnList", usage="content columns written as one paragraph per line", widget="String", required=false, def="")
		def listContentColumnList;
50

  
51

  
52
//@Field @Option(name="structureOrderColumn", usage="structure column coding structure order", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
53
//		def structureOrderColumn;
54

  
55
// Ask the user for the parameter values; stop when the dialog is cancelled
if (!ParametersDialog.open(this)) return;

// Splits a comma-separated parameter into trimmed, non-empty column names.
// Trimming here keeps the names consistent with the header validation (which trims them too);
// dropping empty names makes an empty parameter an empty list instead of [""].
def splitColumns = { value ->
	(value ?: "").toString().split(",").collect { it.trim() }.findAll { it.length() > 0 }
}

// Splits a "column=regexp" selector at its first "=" into [column, regexp].
// Returns null (after printing an error) when there is no "=" instead of failing in substring().
def parseSelector = { name, value ->
	String selector = (value ?: "").toString()
	int eq = selector.indexOf("=")
	if (eq < 0) {
		println "Error: $name should be written 'column_to_test=regexp': $selector"
		return null
	}
	return [selector.substring(0, eq), selector.substring(eq+1)]
}

textMetadataColumnList = splitColumns(textMetadataColumnList)
textContentColumnList = splitColumns(textContentColumnList)
structureMetadataColumnList = splitColumns(structureMetadataColumnList)
structureContentColumnList = splitColumns(structureContentColumnList)
metadataDateColumnList = splitColumns(metadataDateColumnList)
prefixContentColumnList = splitColumns(prefixContentColumnList)
listContentColumnList = splitColumns(listContentColumnList)

textSelector = parseSelector("textSelector", textSelector)
structureSelector = parseSelector("structureSelector", structureSelector)
if (textSelector == null || structureSelector == null) return


println "textIDColumn, joinColumn=$textIDColumn, $joinColumn"
println "textMetadataColumnList columns: $textMetadataColumnList"
println "textContentColumnList columns: $textContentColumnList"
println "structureMetadataColumnList columns: $structureMetadataColumnList"
println "structureContentColumnList columns: $structureContentColumnList"

println "text selector="+textSelector
println "structure selector="+structureSelector
println "structureTag="+structureTag
80

  
81
// Open the table and check that every column named in the parameters exists before reading any row
TableReader reader = new TableReader(inputFile);
if (!reader.readHeaders()) {
	println "Error: no header"
	reader.close() // do not leave the table file open when aborting
	return
}
def headers = Arrays.asList(reader.getHeaders())
println "table columns: $headers"

def ok = true
// parameter name -> column names it requires; the selector test columns are checked too,
// since the rows are later read through them
def requiredColumns = [
	"selection":[textIDColumn, joinColumn, textSelector[0], structureSelector[0]],
	"textMetadataColumnList":textMetadataColumnList,
	"textContentColumnList":textContentColumnList,
	"structureMetadataColumnList":structureMetadataColumnList,
	"structureContentColumnList":structureContentColumnList]
for (def list : requiredColumns) {
	for (String m : list.value) {
		m = m.trim()
		if (!headers.contains(m)) {
			// report every missing column, not only the first one
			println "Error: missing ${list.key} column: $m"
			ok = false
		}
	}
}
if (!ok) {
	reader.close() // do not leave the table file open when aborting
	return;
}
102

  
103
// group by text: text id -> list of records, the text record first, then its structure records
def texts = new LinkedHashMap()
def nRecord = 0
while (reader.readRecord()) {
	nRecord++

	// cells are read null-safely: the former code trimmed first and tested for null afterwards
	String id = (reader.get(textIDColumn) ?: "").trim()
	String join = (reader.get(joinColumn) ?: "").trim()
	String textSelectorValue = reader.get(textSelector[0])?.trim()
	String structureSelectorValue = reader.get(structureSelector[0])?.trim()
	if (textSelectorValue != null && textSelectorValue.matches(textSelector[1])) {
		// text row: always goes first in its group, even if structure rows were read before it
		if (!texts.containsKey(id)) texts[id] = []
		texts[id].add(0, reader.getRecord())
	} else if (structureSelectorValue != null && structureSelectorValue.matches(structureSelector[1])) {
		// structure row: appended to the group of the text it points to
		if (!texts.containsKey(join)) texts[join] = []
		texts[join].add(reader.getRecord())
	} else {
		// ignore rows matching neither selector
	}
}
println "N lines: "+nRecord
println "N groups: "+texts.size()
if (texts.size()  == 0) {
	println "No text found. Aborting."
	reader.close() // do not leave the table file open when aborting
	return
}
outputDirectory.mkdirs() // mkdirs: also creates the missing parent directories
131

  
132
// Value of a cell, never null so that it can always be written in the XML
def cell = { record, att -> record.get(att) ?: "" }

// Writes the given columns of a record as attributes of the element that has just been started
def writeAttributes = { record, columns, writer ->
	for (String att : columns) {
		if (att in metadataDateColumnList) {
			writeMetadataDate(record, att, writer)
		} else {
			writer.writeAttribute(AsciiUtils.buildAttributeId(att), cell(record, att))
		}
	}
}

// Writes the <metadata><list> block showing one "column : value" item per column
def writeMetadataItems = { record, columns, writer ->
	writer.writeStartElement("metadata")
	writer.writeStartElement("list")
	writer.writeAttribute("type", "unordered")
	writer.writeCharacters("\n")
	for (String att : columns) {
		writer.writeStartElement("item")
		writer.writeCharacters(att+" : "+cell(record, att).replaceAll("\n", ";"))
		writer.writeEndElement() // item
		writer.writeCharacters("\n")
	}
	writer.writeEndElement() // list
	writer.writeEndElement() // metadata
	writer.writeCharacters("\n")
}

// Writes the content columns of a record: prefixed list, line list or plain <p> paragraph.
// paragraphId, when not null, is written as the "id" attribute of the plain paragraphs.
def writeContentColumns = { record, columns, writer, paragraphId ->
	for (String att : columns) {
		if (att in prefixContentColumnList) {
			writeContentPrefix(record, att, writer)
		} else if (att in listContentColumnList) {
			writeContentList(record, att, writer)
		} else {
			writer.writeStartElement("p");
			writer.writeAttribute("type", att.trim())
			if (paragraphId != null) {
				writer.writeAttribute("id", paragraphId)
			}
			writer.writeCharacters("\n") // simple XML formating

			writer.writeStartElement("head")
			writer.writeStartElement("hi")
			writer.writeCharacters(att+" : ")
			writer.writeEndElement() // hi
			writer.writeEndElement() // head

			writer.writeCharacters(cell(record, att)) // the column content
			writer.writeEndElement() // p
			writer.writeCharacters("\n") // simple XML formating
		}
	}
}

// Write one XML-TEI file per group: the text record first, then one structure element per structure record
for (def id : texts.keySet()) {
	def toWrite = texts[id]
	def text = toWrite[0]
	String textSelectorValue = text.get(textSelector[0])?.trim()
	if (textSelectorValue == null || !textSelectorValue.matches(textSelector[1])) {
		// the group only contains structure records: their text row is missing from the table
		println "ERROR: '$id' text group with  no text line"
		continue
	}
	println "Processing text: $id"

	File outputfile = new File(outputDirectory, id+".xml")
	FileOutputStream output = new FileOutputStream(outputfile)
	try {
		XMLStreamWriter writer = XMLOutputFactory.newInstance().createXMLStreamWriter(output, "UTF-8")

		writer.writeStartDocument("UTF-8","1.0")
		writer.writeCharacters("\n") // simple XML formating
		writer.writeStartElement("TEI")
		writer.writeCharacters("\n") // simple XML formating
		writer.writeStartElement("teiHeader")
		writer.writeEndElement() // teiHeader
		writer.writeCharacters("\n") // simple XML formating
		writer.writeStartElement("text")
		writer.writeAttribute("id", id)
		writeAttributes(text, textMetadataColumnList, writer)
		writer.writeCharacters("\n") // simple XML formating

		writeMetadataItems(text, textMetadataColumnList, writer)
		writeContentColumns(text, textContentColumnList, writer, null)

		// structure records: a page break then the structure element
		for (int i = 1 ; i < toWrite.size() ; i++) {
			def record = toWrite[i]

			writer.writeEmptyElement("pb") // <pb/>
			writer.writeAttribute("n", ""+i) // page numbers start at 1, like the structure index

			writer.writeStartElement(structureTag)
			writeAttributes(record, structureMetadataColumnList, writer)
			writer.writeCharacters("\n")

			writeMetadataItems(record, structureMetadataColumnList, writer)
			// the content is read from the structure record itself: the prefix and list columns
			// were formerly read from the text record by mistake
			writeContentColumns(record, structureContentColumnList, writer, cell(record, textIDColumn))

			writer.writeEndElement() // struct
			writer.writeCharacters("\n") // simple XML formating
		}

		writer.writeEndElement() // text
		writer.writeCharacters("\n") // simple XML formating
		writer.writeEndElement() // TEI
		writer.close()
	} finally {
		output.close() // XMLStreamWriter.close() does not close the underlying stream
	}
}
// all records are in memory: close the table once, after the last text (was done after every text)
reader.close()
270

  
271
/**
 * Writes a multi-line cell as one <p rend="list"> element per non-empty line.
 * When the cell contains no newline-terminated line, the whole value is written
 * in a single <p rend="no-list"> element.
 *
 * @param record the row; must answer get(att) with the cell value
 * @param att the name of the column to write
 * @param writer the XMLStreamWriter to write to
 */
def writeContentList(def record, def att, def writer) {
	writer.writeCharacters("\n")
	String value = record.get(att) ?: "" // a missing cell is written as an empty paragraph

	// non-empty lines ending with a newline
	def items = []
	def lineMatcher = (value =~ /([^\n]+)\n/)
	while (lineMatcher.find()) {
		items << lineMatcher.group(1)
	}
	if (!items.isEmpty()) {
		// the last line usually has no trailing newline: it was formerly dropped
		String tail = value.substring(value.lastIndexOf("\n") + 1)
		if (tail.length() > 0) {
			items << tail
		}
	}

	if (items.isEmpty()) {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "no-list")
		writer.writeCharacters(value)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
		return
	}

	for (String item : items) {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "list")
		writer.writeCharacters(item)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
	}
}
294

  
295
/**
 * Writes a cell made of prefixed descriptions ("ABC: description ;" or newline-terminated)
 * as a <list rend="prefixes"> element holding one <item type="ABC"> per description.
 * When no description is recognized, the raw value is written inside the list element.
 *
 * @param record the row; must answer get(att) with the cell value
 * @param att the name of the column to write
 * @param writer the XMLStreamWriter to write to
 */
def writeContentPrefix(def record, def att, def writer) {
	writer.writeCharacters("\n")
	writer.writeStartElement("list")
	writer.writeAttribute("rend", "prefixes")
	writer.writeAttribute("type", "unordered")
	writer.writeCharacters("\n")

	String value = record.get(att) ?: "" // a missing cell gives an empty list
	// local state only: the former code stored 'found' and 'matches' in the script binding
	boolean found = false
	// group 1 = the 3 uppercase letters prefix, group 2 = the description;
	// a description must be followed by at least one space then ';' or a newline
	def prefixMatcher = (value =~ /(?s)([A-Z]{3}):? *([^;\n]+?) +[;\n]/)
	while (prefixMatcher.find()) { // one pass: the items were formerly matched twice
		found = true
		writer.writeCharacters("\t")
		writer.writeStartElement("item")
		writer.writeAttribute("type", prefixMatcher.group(1))
		writer.writeStartElement("span")
		writer.writeCharacters(prefixMatcher.group(1)+" ")
		writer.writeEndElement() // span
		writer.writeCharacters(prefixMatcher.group(2))
		writer.writeEndElement() // item
		writer.writeCharacters("\n")
	}
	if (!found) {
		writer.writeCharacters(value)
	}
	writer.writeEndElement() // list
	writer.writeCharacters("\n")
}
322

  
323
/**
 * Writes a dd/MM/yyyy date cell as five attributes of the element being started. The attribute
 * names are the normalized column name followed by "jour" (day), "joursemaine" (day of week name),
 * "mois" (month), "annee" (year) and "tri" (yyyy-MM-dd sort key).
 * When the cell holds no dd/MM/yyyy date, its raw value is written in a single attribute named
 * after the column, like a non-date metadata column, instead of aborting the export.
 *
 * @param record the row; must answer get(att) with the cell value
 * @param att the name of the date column
 * @param writer the XMLStreamWriter to write to
 */
def writeMetadataDate(def record, def att, def writer) {
	String value = record.get(att) ?: ""
	String att_normalized = AsciiUtils.buildAttributeId(att)
	// local matcher: the former code stored 'matches' in the script binding
	def dateMatcher = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
	if (!dateMatcher.find()) {
		println "Warning: '$att' value is not a dd/MM/yyyy date: '$value'"
		writer.writeAttribute(att_normalized, value)
		return
	}
	String day = dateMatcher.group(1)
	String month = dateMatcher.group(2)
	String year = dateMatcher.group(3)
	// only the matched part is parsed, so text around the date cannot break the parsing
	Date date = new java.text.SimpleDateFormat("dd/MM/yyyy").parse(dateMatcher.group(0))

	writer.writeAttribute(att_normalized+"jour", day)
	// 'EEEE' = full day of week name, in the default locale
	writer.writeAttribute(att_normalized+"joursemaine", new java.text.SimpleDateFormat('EEEE').format(date))
	writer.writeAttribute(att_normalized+"mois", month)
	writer.writeAttribute(att_normalized+"annee", year)
	writer.writeAttribute(att_normalized+"tri", year+"-"+month+"-"+day)
}
333

  
334
/*
335
 String name = inputFile.getName()
336
 int idx = name.lastIndexOf(".")
337
 if (idx > 0) name = name.substring(0, idx)
338
 */
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/pager.groovy (revision 2259)
351 351
							writer.writeAttribute("class", "section");
352 352
							String type = parser.getAttributeValue(null,"type");
353 353
							String desc = parser.getAttributeValue(null,"topic");
354
							if (type != null && desc != null) {
354
							String metadata = parser.getAttributeValue(null,"metadata");
355
							if (type != null || desc != null) {
355 356
								writer.writeStartElement("h3");
356
								writer.writeCharacters(type+": "+desc);
357
								for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
358
									String name = parser.getAttributeLocalName(i);
359
									if (!"type".equals(name)
360
									&& !"topic".equals(name)
361
									&& !"startTime".equals(name)
362
									&& !"endTime".equals(name)) {
363
										writer.writeCharacters(", "+name+": "+parser.getAttributeValue(i));
357
								if (type != null || type.length() ==0) {
358
									writer.writeCharacters(type+": "+desc);
359
								} else {
360
									writer.writeCharacters(desc);
361
								}
362
								writer.writeEndElement(); // h3
363
								
364
								if (metadata != null && metadata.length() > 0) { // the metadata to show
365
									writer.writeStartElement("ul");
366
									for (def m : metadata.split("\t")) {
367
										writer.writeStartElement("li");
368
										writer.writeCharacters(m);
369
										writer.writeEndElement(); // li
364 370
									}
371
									writer.writeEndElement(); // ul
372
								} else if (parser.getAttributeCount() > 1) { // process all attributes
373
									writer.writeStartElement("ul");
374
									for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
375
										String name = parser.getAttributeLocalName(i);
376
										if (!"type".equals(name)
377
										&& !"topic".equals(name)
378
										&& !"startTime".equals(name)
379
										&& !"endTime".equals(name)) {
380
											writer.writeStartElement("li");
381
											writer.writeCharacters(""+name+": "+parser.getAttributeValue(i));
382
											writer.writeEndElement(); // li
383
										}
384
									}
385
									writer.writeEndElement(); // ul
365 386
								}
366
								writer.writeEndElement(); // h3
367 387
							}
368 388
							break;
369 389
						case "sp":
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/FixINAXML.groovy (revision 2259)
1
import org.txm.utils.ConsoleProgressBar
2

  
3
// Repairs, in place, the ".trs" transcription files of the directory:
// a doubled quote after the punct attribute, the <unk> pseudo tags and the "Time" attribute case.
def directory = new File("/home/mdecorde/TEMP/ANTRACT/AF/trs")
println "Fixing $directory"
def files = directory.listFiles()
if (files == null) { // listFiles() returns null when the path is not a readable directory
	println "$directory is not a readable directory"
	return
}
ConsoleProgressBar cpb = new ConsoleProgressBar(files.size())
for (File trsFile : files) {
	cpb.tick() // every file counts in the progress, even the skipped ones
	if (!trsFile.getName().endsWith(".trs")) {
		continue;
	}
	String content = trsFile.getText("UTF-8")
	content = content.replaceAll("punct=\"([^\"]+)\"\">", "punct=\"\$1\">") // punct="x""> -> punct="x">
	content = content.replaceAll("<unk>", "???") // <unk> is not a closed tag
	content = content.replaceAll(" Time=\"", " time=\"") // attribute name in lower case
	trsFile.setText(content, "UTF-8")
}
cpb.done()
println "Done"
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/AddSections.groovy (revision 2259)
1
import java.nio.charset.Charset
2

  
3
import org.txm.utils.AsciiUtils
4
import org.txm.utils.ConsoleProgressBar
5
import org.txm.utils.CsvReader
6
import java.time.LocalTime
7
import java.time.format.DateTimeFormatter
8

  
9
File metadataFile = new File("/home/mdecorde/TEMP/ANTRACT/AF/metadata.tsv")
10
File trsDirectory = new File("/home/mdecorde/TEMP/ANTRACT/AF/trs")
11
def idTRSColumn = "Lien notice principale"
12
def typeColumns = ["Identifiant de la notice"]
13
def topicColumns = ["Titre propre", "Genre", "antract_debut", "antract_fin"]
14
def startTimeColumn = "antract_debut"
15
def endTimeColumn = "antract_fin"
16
//def metadataColumns = ["Identifiant de la notice", "Titre propre", "antract_debut", "antract_fin"]
17
def metadataColumns = ["Date de diffusion", "Descripteurs (Aff. Col.)", "Durée", "Générique (Aff. Col.)", "Genre", "Identifiant de la notice", "Langue VO / VE ", "Lien notice principale", "Nature de production ", "Nom fichier segmenté (info)", "Notes du titre ", "Producteurs (Aff.)", "Résumé", "Séquences", "Thématique", "Titre propre", "Type de date", "Type de notice"]
18

  
19
// Check the inputs before doing anything: the TRS directory, the metadata file and its columns
if (!trsDirectory.exists()) {
	println "$trsDirectory not found"
	return
}
if (!metadataFile.exists()) {
	println "$metadataFile not found"
	return
}


// the metadata file is a tab-separated UTF-8 table with a header line
CsvReader reader = new CsvReader(metadataFile.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
reader.readHeaders()
def header = reader.getHeaders()
if (!header.contains(idTRSColumn)) {
	println "No TRS ID $idTRSColumn column found"
	return
}
if (!header.contains(startTimeColumn)) {
	println "No start time $startTimeColumn column found"
	return
}
if (!header.contains(endTimeColumn)) {
	println "No end time $endTimeColumn column found"
	return
}
// each loop tests its own column: they formerly all tested endTimeColumn again,
// so a missing metadata, type or topic column was never reported
for (def col : metadataColumns) {
	if (!header.contains(col)) {
		println "No $col column found"
		return
	}
}
for (def col : typeColumns) {
	if (!header.contains(col)) {
		println "No type $col column found"
		return
	}
}
for (def col : topicColumns) {
	if (!header.contains(col)) {
		println "No topic $col column found"
		return
	}
}
58

  
59
File outputDirectory = new File(trsDirectory, "out")
60
println "Writing result to $outputDirectory..."
61

  
62
dateTimeFormatter = DateTimeFormatter.ISO_LOCAL_TIME
63
/**
 * Converts a "HH:mm:ss:ff" timecode to a number of seconds.
 * The part after the last ":" is a frame number counted at 25 frames per second;
 * a frame number above 25 is ignored (counted as 0).
 *
 * @param str the timecode; the part before the last ":" must be an ISO local time
 * @return the number of seconds, or null when str is null or contains no ":"
 * @throws NumberFormatException when the frame part is not an integer
 * @throws java.time.format.DateTimeParseException when the time part is not an ISO local time
 */
def strTotime(def str) {
	if (str == null) {
		return null
	}
	int lastColon = str.lastIndexOf(":")
	if (lastColon == -1) {
		return null
	}

	// local variables only: the former code stored its intermediate values in the script binding
	int bonusFrame = Integer.parseInt(str.substring(lastColon+1))
	if (bonusFrame > 25) {
		bonusFrame = 0;
	}

	LocalTime time = LocalTime.parse(str.substring(0, lastColon), DateTimeFormatter.ISO_LOCAL_TIME)
	int totalSeconds = time.toSecondOfDay() // hours*3600 + minutes*60 + seconds

	// Groovy integer division gives a decimal number: the frames become a fraction of second
	return totalSeconds + (bonusFrame/25)
}
81

  
82
// Read the metadata table: TRS file id -> list of [startTime, endTime, section attributes]
def sectionGroupsToInsert = [:]
// attribute name -> columns whose non-empty values are joined in the attribute value
def attributeColumns = ["metadata":metadataColumns, "topic":topicColumns, "type":typeColumns]
while (reader.readRecord()) {
	String id = reader.get(idTRSColumn).trim()
	if (id.endsWith(".mp4")) {
		id = id.substring(0, id.length()-4) // the id is the media file name without its extension
	}
	if (id.length() == 0) {
		continue;
	}

	// the group is registered even when the row turns out to have no start time
	def section = sectionGroupsToInsert.get(id, [])

	String rawStart = reader.get(startTimeColumn)
	if (rawStart == null || rawStart.length() == 0) {
		continue // ignore non timed sections
	}

	// same key order as before: metadata, topic, type, startTime, endTime, synchronized
	def m = [:]
	attributeColumns.each { attribute, columns ->
		def data = columns.findResults { col ->
			String cellValue = reader.get(col).trim()
			cellValue.length() > 0 ? col+": "+cellValue.replace("\n", "") : null
		}
		m[attribute] = data.join("\t")
	}

	m["startTime"] = strTotime(reader.get(startTimeColumn))
	m["endTime"] = strTotime(reader.get(endTimeColumn))
	m["synchronized"] = "true"

	section << [m["startTime"], m["endTime"], m]
}
115

  
116
println "N sections: "+sectionGroupsToInsert.size()
117

  
118
// For each TRS file having sections in the metadata table: move the Turn elements of its single
// Section into new Section elements built from the metadata, then write the result in outputDirectory
ConsoleProgressBar cpb = new ConsoleProgressBar(sectionGroupsToInsert.keySet().size())
for (String id : sectionGroupsToInsert.keySet()) {
	cpb.tick()
	File trsFile = new File(trsDirectory, id+".trs")
	if (!trsFile.exists()) {
		continue
	}
	//println "Processing $id..."
	def sections = sectionGroupsToInsert[id]
	// order the sections by start time then end time
	sections = sections.sort() { a, b -> a[0] <=> b[0] ?: a[1] <=> b[1] }

	// Open input file
	def slurper = new XmlParser();
	// the TRS files declare a DOCTYPE: the parser must accept it
	slurper.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false)

	def trs = slurper.parse(trsFile.toURI().toString())
	def trsEpisodes = trs.Episode // 1
	if (trsEpisodes.size() > 1) {
		println "multiple Episode node in $trsFile"
		continue
	}
	if (trsEpisodes.size() == 0) { // formerly failed later with a NullPointerException
		println "no Episode node in $trsFile"
		continue
	}
	def trsEpisode = trsEpisodes[0]
	def trsSections =  trs.Episode.Section // 1
	if (trsSections.size() > 1) {
		println "multiple Section node in $trsFile"
		continue
	}
	if (trsSections.size() == 0) { // formerly failed later with a NullPointerException
		println "no Section node in $trsFile"
		continue
	}
	def trsSection = trsSections[0]

	def turns = trsSection.Turn
	def iSection = 0; // index of the last section found: the search restarts from it
	def currentSection = null // the metadata section of the current node, null when not synchronized
	def currentNode = null // the Section element receiving the turns

	for (def turn : turns) {
		def start = Float.parseFloat(turn.@startTime)
		def end = Float.parseFloat(turn.@endTime)

		// find the first section overlapping the turn
		def found = null;
		for (int i = iSection ; i < sections.size() ; i++) {
			if (end < sections[i][0]) { // Turn is before section

			} else if (sections[i][1] < start) { // Turn is after section

			} else {
				found = sections[i]
				iSection = i
				break; // stop searching and set iSection to accelerate next search
			}
		}

		if (found == null) {
			// no section for this turn: open a "not synchronized" section unless one is already open
			if (currentSection != null || currentNode == null) {
				currentNode = new Node(trsEpisode, "Section", ["type":"non synchronisée", "topic":"none", "startTime":turn.@startTime, "endTime":"", "synchronized":"false"] )
				currentSection = null;
			}
		} else {
			if (found != currentSection) {
				// leaving a "not synchronized" section: it ends with its last turn
				if (currentNode != null && currentNode.@synchronized == "false") {
					def tmp = currentNode.Turn
					currentNode.@endTime = tmp[-1].@endTime
				}

				currentSection = found
				currentNode = new Node(trsEpisode, "Section", currentSection[2])
			}
		}
		// move the turn from the initial section to the current one
		trsSection.remove(turn)
		currentNode.append(turn)
	}

	// a "not synchronized" section closing the file also ends with its last turn:
	// its endTime was formerly left empty
	if (currentNode != null && currentNode.@synchronized == "false") {
		def lastTurns = currentNode.Turn
		if (lastTurns.size() > 0) {
			currentNode.@endTime = lastTurns[-1].@endTime
		}
	}

	//remove the initial section
	trsEpisode.remove(trsSection)

	outputDirectory.mkdir()
	File outfile = new File(outputDirectory, trsFile.getName())
	outfile.withWriter("UTF-8") { writer ->
		// XmlNodePrinter writes neither the XML declaration nor the DOCTYPE
		writer.write('<?xml version="1.0" encoding="UTF-8"?>\n<!DOCTYPE Trans SYSTEM "trans-14.dtd">\n')
		def printer = new groovy.util.XmlNodePrinter(new PrintWriter(writer))
		printer.setPreserveWhitespace(true)
		printer.print(trs)
	}
}
cpb.done()
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/importer.groovy (revision 2259)
54 54
import org.txm.utils.*;
55 55
import org.txm.metadatas.*;
56 56

  
57
// TODO: Auto-generated Javadoc
58 57
/**
59 58
 * The Class importer.
60 59
 */
......
116 115

  
117 116
		// TRS -> TEI
118 117
		println "Converting TRS to TEI "+trsfiles.size()+" files"
118
		ConsoleProgressBar cpb = new ConsoleProgressBar(trsfiles.size())
119 119
		for (File infile : trsfiles) {
120
			print "."
120
			cpb.tick()
121 121
			String textid = infile.getName()
122 122
			int idx = textid.indexOf(".trs")
123 123
			if (idx > 0) textid = textid.substring(0, idx)
......
128 128
				//return false;
129 129
			}
130 130
		}
131
		println ""
131
		cpb.done()
132 132

  
133 133
		if (metadatas != null) {
134 134
			if (metadatas.getHeadersList().size() > 0) {
......
137 137

  
138 138
				trsfiles = txmDir.listFiles();
139 139
				trsfiles.sort()
140
				cpb = new ConsoleProgressBar(trsfiles.size())
140 141
				for (File infile : trsfiles) {
141 142
					File outfile = new File(txmDir, "tmp.xml")
142 143
					if (metadatas != null && metadatas.isInitialized()) {
143
						print "."
144
						cpb.tick()
144 145
						if (!metadatas.injectMetadatasInXml(infile, outfile, "text")) {
145 146
							println("Failed to inject metadata in "+infile)
146 147
							outfile.delete()
......
152 153
						outfile.renameTo(infile)
153 154
					}
154 155
				}
156
				cpb.done()
155 157
			}
156 158
		}
157 159

  
158
		println ""
159 160

  
160 161
		// TOKENIZER ENTITIES
161
		println "Tokenizing entities "+txmDir.listFiles().length+" files"
162
		for (File pfile : txmDir.listFiles()) {
163
			print "."
162
		def files = txmDir.listFiles()
163
		println "Tokenizing entities "+files.length+" files"
164
		cpb = new ConsoleProgressBar(files.length)
165
		for (File pfile : files) {
166
			cpb.tick()
164 167
			TokenizeEntities tokenizer = new TokenizeEntities(pfile.toURI().toURL());
165 168
			File outfile = File.createTempFile("tok", ".xml", pfile.getParentFile());
166 169
			if (tokenizer.process(outfile)) {
......
168 171
			}
169 172
			outfile.delete();
170 173
		}
171
		println ""
174
		cpb.done()
172 175

  
173 176
		//TOKENIZE
174
		println "Tokenizing "+txmDir.listFiles().length+" files from $txmDir"
177
		println "Tokenizing "+files.length+" files from $txmDir"
175 178
		File tokenizedDir = new File(binDir, "tokenized")
176 179
		tokenizedDir.mkdir()
177
		for (File pfile : txmDir.listFiles()) {
178
			print "."
180
		cpb = new ConsoleProgressBar(files.length)
181
		for (File pfile : files) {
182
			cpb.tick()
179 183
			String filename = pfile.getName().substring(0, pfile.getName().length()-4)
180 184
			File tfile = new File(tokenizedDir, pfile.getName())
181 185
			try {
......
189 193
				return false;
190 194
			}
191 195
		}
192
		println ""
196
		cpb.done()
193 197

  
194 198
		//TRANSFORM INTO XML-TEI-TXM
195
		println("Building XML-TXM ("+txmDir.listFiles().length+" files)")
196
		for (File tfile : tokenizedDir.listFiles()) {
197
			print "."
199
		files = tokenizedDir.listFiles()
200
		println("Building XML-TXM ("+files.length+" files)")
201
		cpb = new ConsoleProgressBar(files.length)
202
		for (File tfile : files) {
203
			cpb.tick()
198 204
			String filename = tfile.getName().substring(0, tfile.getName().length()-4)
199 205
			File xmlfile = new File(txmDir, tfile.getName())
200 206

  
......
238 244
			}
239 245
		}
240 246

  
241
		println ""
247
		cpb.done()
248
				
242 249
		return txmDir.listFiles() != null;
243 250
	}
244 251

  
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/compiler.groovy (revision 2259)
39 39
import org.txm.importer.cwb.CwbMakeAll
40 40
import org.txm.importer.cwb.CwbProcess;
41 41
import org.txm.importer.cwb.PatchCwbRegistry;
42
import org.txm.utils.ConsoleProgressBar
42 43
import org.txm.utils.Pair;
43 44

  
44 45
/**
......
160 161
		output.close();
161 162

  
162 163
		println("Compiling "+xmlfiles.size()+" files")
164
		ConsoleProgressBar cpb = new ConsoleProgressBar(xmlfiles.size())
163 165
		for (File f :xmlfiles) {
164
			if (f.exists())
166
			if (f.exists()) {
167
				cpb.tick()
165 168
				if (!process(f)) {
166 169
					println("Failed to compile "+f)
167 170
				}
171
			}
168 172
		}
169

  
173
		cpb.done()
170 174
		if (!createOutput(cqpFile)) return false;
171 175
		output.write("</txmcorpus>\n")
172 176
		output.close();
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZImporter.groovy (revision 2259)
323 323

  
324 324
	public boolean doTokenizeStep() {
325 325

  
326
		new File(module.getBinaryDirectory(),"tokenized").deleteDir()
326 327
		new File(module.getBinaryDirectory(),"tokenized").mkdir()
327 328

  
328 329
		String outSideTextTagsRegex = "";

Formats disponibles : Unified diff