Révision 2268

tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/transcriberLoader.groovy (revision 2268)
112 112
			Toolbox.getMetadataColumnSeparator(),
113 113
			Toolbox.getMetadataTextSeparator(), 1)
114 114
}
115
else
115
else {
116 116
	println "no metadata file: "+allMetadataFile
117
}
117 118

  
118 119
File propertyFile = new File(srcDir, "import.properties")//default
119 120
Properties props = new Properties();
......
289 290
		cpb.done()
290 291

  
291 292
		// Copy the stylesheets next to the HTML editions:
		//  - transcriber.css goes to the "onepage" and "default" edition directories
		//  - txm.css goes to the "default" edition directory
		File cssfile = new File(Toolbox.getTxmHomePath(), "css/transcriber.css")
		File cssTXMFile = new File(Toolbox.getTxmHomePath(), "css/txm.css")
		if (htmlDir.exists()) {
			if (cssfile.exists()) {
				FileCopy.copy(cssfile, new File(htmlDir, "onepage/transcriber.css"));
				FileCopy.copy(cssfile, new File(htmlDir, "default/transcriber.css"));
			}
			// txm.css is copied from its own source file: copying transcriber.css
			// under the txm.css name would ship the wrong rules.
			if (cssTXMFile.exists()) {
				FileCopy.copy(cssTXMFile, new File(htmlDir, "default/txm.css"));
			}
		}
298 300
		
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/pager.groovy (revision 2268)
44 44
 *  
45 45
 */
46 46
class pager {
47

  
47
	
48 48
	boolean SIMPLE_TOOLTIP = false; // show less properties in word tooltips
49 49
	String ENQ_HIGHLIGHT_ELEMENT = "b"
50

  
50
	
51 51
	List<String> NoSpaceBefore;
52

  
52
	
53 53
	/** The No space after. */
54 54
	List<String> NoSpaceAfter;
55

  
55
	
56 56
	/** The pages. */
57 57
	def pages = [];
58 58
	def indexes = [];
59

  
59
	
60 60
	/** The wordcount. */
61 61
	int wordcount = 0;
62

  
62
	
63 63
	/** The pagecount. */
64 64
	int pagecount = 0;
65

  
65
	
66 66
	/** The wordmax. */
67 67
	int wordmax = 10;
68

  
68
	
69 69
	/** The wordid. */
70 70
	String wordid;
71

  
71
	
72 72
	/** The first word. */
73 73
	boolean firstWord = true;
74

  
74
	
75 75
	/** The wordvalue. */
76 76
	String wordvalue;
77

  
77
	
78 78
	/** The interpvalue. */
79 79
	String interpvalue;
80

  
80
	
81 81
	/** The lastword. */
82 82
	String lastword = " ";
83

  
83
	
84 84
	/** The wordtype. */
85 85
	String wordtype;
86

  
86
	
87 87
	/** The flagform. */
88 88
	boolean flagform = false;
89

  
89
	
90 90
	/** The flaginterp. */
91 91
	boolean flaginterp = false;
92

  
92
	
93 93
	boolean flagcomment = false;
94

  
94
	
95 95
	/** The url. */
96 96
	private def url;
97

  
97
	
98 98
	/** The input data. */
99 99
	private def inputData;
100

  
100
	
101 101
	/** The factory. */
102 102
	private def factory;
103

  
103
	
104 104
	/** The parser. */
105 105
	private XMLStreamReader parser;
106

  
106
	
107 107
	/** The writer. */
108 108
	XMLStreamWriter writer;
109 109
	BufferedOutputStream output;
110

  
110
	
111 111
	File txmfile;
112

  
112
	
113 113
	File outfile;
114

  
114
	
115 115
	String corpusname ="";
116 116
	String cuttingTag = "pb"
117 117
	String txtname;
118 118
	File htmlDir;
119 119
	File defaultDir;
120 120
	Metadatas metadatas;
121

  
121
	
122 122
	def interviewers = [];
123 123
	def eventTranslations = ["^^":"mot inconnu", "?":"orthographe incertaine",
124 124
		"()":"rupture de syntaxe", "b":"bruit indéterminé",
......
160 160
		this.txmfile = txmfile;
161 161
		this.htmlDir = htmlDir;
162 162
		this.txtname = txtname;
163

  
163
		
164 164
		inputData = url.openStream();
165 165
		factory = XMLInputFactory.newInstance();
166 166
		parser = factory.createXMLStreamReader(inputData);
167

  
167
		
168 168
		defaultDir = new File(htmlDir, "default")
169 169
		defaultDir.mkdir()
170 170
		new File(htmlDir, "onepage").mkdir()
171 171
		outfile = new File(htmlDir, "onepage/${txtname}.html");
172 172
		createOutput(outfile)
173

  
173
		
174 174
		try {
175 175
			process();
176 176
		} catch(Exception e) {
......
181 181
			}
182 182
		}
183 183
	}
184

  
184
	
185 185
	/**
186 186
	 * Creates the output.
187 187
	 *
......
194 194
			XMLOutputFactory outfactory = XMLOutputFactory.newInstance();
195 195
			output = new BufferedOutputStream(new FileOutputStream(outfile))
196 196
			writer = outfactory.createXMLStreamWriter(output, "UTF-8");//create a new file
197

  
197
			
198 198
			return true;
199 199
		} catch (Exception e) {
200 200
			System.out.println(e.getLocalizedMessage());
201 201
			return false;
202 202
		}
203 203
	}
204

  
204
	
205 205
	/** The events. */
206 206
	List<String> events = [];
207 207
	String previousEvent = "", nextEvent = "";
......
209 209
	 * Process.
210 210
	 */
211 211
	void process() {
212

  
212
		
213 213
		String previousElem = "";
214 214
		boolean parolesRaportees = false;
215 215
		boolean firstWord = true;
......
221 221
		ArrayList<String> whos = [];
222 222
		HashMap<String, String> speakers = new HashMap<String, String>();
223 223
		HashMap<String, String> topics = new HashMap<String, String>();
224

  
224
		
225 225
		writer.writeStartDocument("UTF-8","1.0");
226 226
		writer.writeStartElement("html");
227 227
		//<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
......
231 231
		writer.writeAttribute("charset", "UTF-8");
232 232
		writer.writeEndElement();
233 233
		writer.writeStartElement("head");
234

  
234
		
235 235
		//<link rel="stylesheet" type="text/css" href="class.css" />
236 236
		writer.writeStartElement("link");
237 237
		writer.writeAttribute("rel", "stylesheet");
......
239 239
		writer.writeAttribute("href", "transcriber.css");
240 240
		writer.writeEndElement();
241 241
		writer.writeEndElement();
242

  
242
		
243 243
		nbBreak++
244 244
		writer.writeStartElement("body");
245
		writer.writeAttribute("class", "txmeditionpage")
245 246
		writer.writeEmptyElement("pb");
246 247
		writer.writeAttribute("id", ""+nbBreak);
247 248
		pages << new File(defaultDir, "${txtname}_${nbBreak}.html")
248

  
249
		
249 250
		for (int event = parser.next(); event != XMLStreamConstants.END_DOCUMENT; event = parser.next()) {
250 251
			switch (event) {
251 252
				case XMLStreamConstants.START_ELEMENT:
252 253
					localname = parser.getLocalName();
253 254
					switch (localname) {
254 255
						case "text":
256
						
255 257
							writer.writeStartElement("h2");
256
							writer.writeAttribute("class","titre");
258
							writer.writeAttribute("class","title");
257 259
							String title = parser.getAttributeValue(null, "title");
260
						
258 261
							if (title != null) {
259 262
								writer.writeCharacters(title);
260 263
							} else {
261 264
								writer.writeCharacters("Transcription "+txmfile.getName().substring(0, txmfile.getName().length() - 4));
262 265
							}
263
							writer.writeEndElement();
264
							if(metadatas != null) {
266
						
267
							writer.writeEmptyElement("br");
268
							writer.writeStartElement("a");
269
							writer.writeAttribute("onclick", "txmcommand('id', 'org.txm.backtomedia.commands.function.BackToMedia', 'corpus', '"+corpusname+"', 'text', '"+txtname+"', 'time', '0.0')");
270
							writer.writeAttribute("style", "cursor: pointer;")
271
							writer.writeAttribute("class", "play-media")
272
							writer.writeCharacters(" ♪♪");
273
							writer.writeEndElement(); // a
274
						
275
							writer.writeEndElement(); // h2
276
						
277
							String subtitle = parser.getAttributeValue(null, "subtitle");
278
							if (subtitle != null && subtitle.length() > 0) {
279
								writer.writeStartElement("h3");
280
								writer.writeAttribute("class", "subtitle");
281
								writer.writeCharacters(subtitle);
282
								writer.writeEndElement(); // h3
283
							}
284
						
285
//							println "metadatas != null: "+(metadatas != null)
286
//							if (metadatas != null) {
265 287
								writer.writeStartElement("table");
266 288
								boolean grey = false;
267
								for (String name : metadatas.getPropertyNames()) {
268
									if ("title" == name) continue; // ignore "title" metadata
289
								for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
290
									String name = parser.getAttributeName(i);
291
									String value = parser.getAttributeValue(i);
292
									
293
									if ("title" == name) {
294
										continue; // ignore "title" metadata
295
									}
296
									
269 297
									grey = !grey;
270 298
									writer.writeStartElement("tr");
271
									if (grey)
299
									if (grey) {
272 300
										writer.writeAttribute("style","background-color:lightgrey;")
273
									String value = parser.getAttributeValue(null, name);
301
									}
302
									
274 303
									if (value != null) {
275 304
										writer.writeStartElement("td");
276 305
										writer.writeCharacters(name);
......
286 315
									writer.writeEndElement();
287 316
								}
288 317
								writer.writeEndElement();
289
							}
318
//							}
290 319
							break;
291 320
						case "Topics":
292 321
						/*writer.writeStartElement("h2");
......
347 376
							flagcomment = true;
348 377
							break;
349 378
						case "div":
							// A section unconditionally starts a new page: write the page-break
							// milestone, then record the page file and the current word id.
							nbBreak++
							writer.writeEmptyElement("pb");
							writer.writeAttribute("id", ""+nbBreak);
							writer.writeCharacters("\n");

							pages << new File(defaultDir, "${txtname}_${nbBreak}.html")
							indexes << wordid

							// restart the per-page word counter
							wordcount = 0;
							shouldBreak = false;

							writer.writeStartElement("div")
							writer.writeAttribute("class", "section")

							String type = parser.getAttributeValue(null, "type")
							writer.writeAttribute("type", ""+type)
							String desc = parser.getAttributeValue(null, "topic")

							if (type != null || desc != null) {
								writer.writeStartElement("h2");
								writer.writeAttribute("class", "section-title")

								// Title is "type: topic" when both are available, otherwise whichever
								// one is present. The type must be tested with && here: with || a
								// missing type was dereferenced and raised a NullPointerException.
								String sectionTitle
								if (type != null && type.length() > 0) {
									sectionTitle = (desc != null) ? type+": "+desc : type
								} else {
									sectionTitle = (desc != null) ? desc : ""
								}
								writer.writeCharacters(sectionTitle)

								// link back to the media at the start time of the section
								String sectionStartTime = parser.getAttributeValue(null, "startTime")
								if (sectionStartTime != null) {
									writer.writeEmptyElement("br");
									writer.writeStartElement("a")
									writer.writeAttribute("onclick", "txmcommand('id', 'org.txm.backtomedia.commands.function.BackToMedia', 'corpus', '"+corpusname+"', 'text', '"+txtname+"', 'time', '"+sectionStartTime+"')");
									writer.writeAttribute("style", "cursor: pointer;")
									writer.writeAttribute("class", "play-media")
									writer.writeCharacters(" ♪♪")
									writer.writeEndElement() // a
								}

								writer.writeEndElement(); // h2
							}

							String metadata = parser.getAttributeValue(null, "metadata")
							if (metadata != null && metadata.length() > 0) { // the metadata to show
								// the "metadata" attribute holds the items to list, separated by "<li>"
								writer.writeStartElement("ul")
								for (def m : metadata.split("<li>")) {
									writer.writeStartElement("li")
									writer.writeCharacters(m)
									writer.writeEndElement() // li
								}
								writer.writeEndElement() // ul
							} else if (parser.getAttributeCount() > 1) { // process all attributes
								// no explicit metadata: list every attribute except the ones
								// already used for the title and the media link
								writer.writeStartElement("ul")
								for (int i = 0 ; i < parser.getAttributeCount() ; i++) {
									String name = parser.getAttributeLocalName(i)
									if (!"type".equals(name)
									&& !"topic".equals(name)
									&& !"startTime".equals(name)
									&& !"endTime".equals(name)) {
										writer.writeStartElement("li")
										writer.writeCharacters(""+name+": "+parser.getAttributeValue(i))
										writer.writeEndElement() // li
									}
								}
								writer.writeEndElement(); // ul
							}

							break;
389 447
						case "sp":
390 448
							endBoldIfNeeded()
......
392 450
							firstWho = true;
393 451
							spokenTurn = false;
394 452
							overlapping = false
395

  
453
						
396 454
							writer.writeStartElement("p");
397 455
							writer.writeAttribute("class", "turn");
398

  
456
						
399 457
							overlapping = ("true" == parser.getAttributeValue(null,"overlap"))
400 458
							String spid = parser.getAttributeValue(null,"speaker");
459
						
401 460
							whos = []
402 461
							if (overlapping) {
403 462
								writer.writeEmptyElement("br");
404 463
								writeSpeaker(parser.getAttributeValue(null,"speaker"), false)
405

  
464
								
406 465
								writer.writeEmptyElement("br");
407 466
								whos = spid.split(" ")
408 467
							}
409

  
468
						
410 469
							break;
411 470
						case "u":
412 471
							writer.writeCharacters("\n");
413 472
							this.currentTime = parser.getAttributeValue(null,"time");
414

  
473
						
415 474
							if (previousElem == "u" && writenLength == 0) { // if previous u had no words, it was a silence
416 475
								writer.writeStartElement("span");
417 476
								writer.writeAttribute("class", "event");
......
419 478
								writer.writeEndElement(); // span
420 479
								writer.writeEmptyElement("br");
421 480
							}
422

  
481
						
423 482
							String spk = parser.getAttributeValue(null, "spk")
424 483
							if (spk != null && spk != previousSPK) {
425 484
								endBoldIfNeeded()
......
427 486
								writeSpeaker(parser.getAttributeValue(null, "spk"), overlapping)
428 487
								startBoldIfNeeded()
429 488
							}
430

  
489
						
431 490
							writeCurrentTime()
432 491
							previousSPK = spk
433

  
492
						
434 493
						//							writenLength = 0;
435 494
						/*writer.writeStartElement("span");
436 495
					 writer.writeAttribute("class", "sync");
437 496
					 writer.writeCharacters("["+parser.getAttributeValue(null,"time")+"]");
438 497
					 writer.writeEndElement();*/
439

  
498
						
440 499
							break;
441 500
						case "event":
442 501
							spokenTurn = true;
......
458 517
										events.remove(events.size()-1)
459 518
								}
460 519
								else if (parser.getAttributeValue(null, "extent") == "begin") 	{
461

  
520
									
462 521
									writer.writeCharacters(" ["+desc+"> ");
463 522
									events.add(desc)
464 523
								}
......
484 543
									wordid = (parser.getAttributeValue(i));
485 544
									break;
486 545
								}
487

  
546
						
488 547
							wordcount++;
489 548
							if (wordcount >= wordmax) {
490 549
								shouldBreak = true;
491 550
							}
492

  
551
						
493 552
							if (firstWord) {
494 553
								indexes << wordid
495 554
								firstWord = false;
496 555
							}
497

  
556
						
498 557
							break;
499

  
558
						
500 559
						case "ana":
501

  
560
						
502 561
							String type = parser.getAttributeValue(null,"type").substring(1);
503 562
							if (SIMPLE_TOOLTIP) {
504 563
								if (type.contains("lemma") || type.contains("pos")) {
......
510 569
								interpvalue+=", "+type+"="
511 570
							}
512 571
							break;
513

  
572
						
514 573
						case "form":
515 574
							wordvalue="";
516 575
							interpvalue ="";
......
536 595
							break;
537 596
						case "Speaker":
538 597
							break;
539

  
598
						
540 599
						case "div":
541 600
						//writer.writeCharacters("}");
542

  
601
						
543 602
							writer.writeEndElement(); // div
544 603
							writer.writeCharacters("\n");
545 604
							break;
......
553 612
								writer.writeEndElement();
554 613
								writer.writeEmptyElement("br");
555 614
							}
556

  
615
						
557 616
							writer.writeEndElement(); // p
558

  
617
						
559 618
							if (shouldBreak) {
560 619
								nbBreak++
561 620
								writer.writeEmptyElement("pb");
562 621
								writer.writeAttribute("id", ""+nbBreak);
563 622
								writer.writeCharacters("\n");
564

  
623
								
565 624
								pages << new File(defaultDir, "${txtname}_${nbBreak}.html")
566 625
								indexes << wordid
567

  
626
								
568 627
								wordcount = 0;
569 628
								shouldBreak = false;
570 629
							}
......
590 649
							String endOfLastWord = "";
591 650
							if(l > 0)
592 651
								endOfLastWord = lastword.subSequence(l-1, l);
593

  
652
						
594 653
							if(interpvalue != null)
595 654
								interpvalue = interpvalue.replace("\"","&quot;");
596 655
							if(events.size() > 0)
597 656
								interpvalue = interpvalue.replace("event=", "event="+events.toString().replace("\"","&quot;")); // remove ", "
598

  
657
						
599 658
							if(nextEvent.length() > 0)
600 659
							{
601 660
								interpvalue = interpvalue.replace("event=", "event="+nextEvent+", ")
......
617 676
								//								println " SPACE"
618 677
								writer.writeCharacters(" ");
619 678
							}
620

  
679
						
621 680
							if (interpvalue.contains("rapp1")) {
622 681
								writer.writeCharacters(" «");
623 682
							} else if (wordvalue == "\"") {
......
642 701
								writer.writeCharacters("_[!]");
643 702
								writer.writeEndElement();
644 703
							}
645

  
704
						
646 705
							if (interpvalue.contains("rapp2")) {
647 706
								writer.writeCharacters(" » ");
648 707
							}
649

  
708
						
650 709
							lastword=wordvalue;
651 710
							break;
652 711
					}
653

  
712
				
654 713
					break;
655

  
714
				
656 715
				case XMLStreamConstants.CHARACTERS:
657 716
					if(flagform)
658 717
						if(parser.getText().length() > 0)
......
664 723
			}
665 724
		}
666 725
		writer.writeEndElement();
667

  
726
		
668 727
		writer.writeEmptyElement("pb");
669 728
		nbBreak++
670 729
		writer.writeAttribute("id", ""+nbBreak);
671

  
730
		
672 731
		writer.writeEndElement();
673 732
		writer.close();
674 733
		output.close();
675 734
		if (parser != null) parser.close();
676 735
		if (inputData != null) inputData.close();
677

  
736
		
678 737
		File txmhome = new File(org.txm.Toolbox.getTxmHomePath());
679 738
		File xlsDir  = new File(txmhome, "xsl");
680 739
		File xslfile = new File(xlsDir,"breakByMilestone.xsl");
......
685 744
		//		println "html: "+outfile
686 745
		//		println "pages: "+pages
687 746
		//		println "words: "+indexes
688

  
689

  
747
		
748
		
690 749
		if (pages.size() > 1) {
691 750
			for (int i = 1 ; i < nbBreak ; i++) {
692 751
				ApplyXsl2 a = new ApplyXsl2(xslfile.getAbsolutePath());
693
				String[] params = ["pbval1", i,"pbval2", i+1];
694

  
752
				String[] params = ["pbval1", i, "pbval2", i+1];
753
				
695 754
				File resultfile = pages[i-1]
696 755
				//println "BBmilestones: "+i+" "+(i+1)+" in file "+resultfile
697 756
				//println "process $outfile -> $resultfile"
......
702 761
			FileCopy.copy(outfile, page)
703 762
		}
704 763
	}
705

  
706
	/**
	 * Writes the current time marker: a span of class "sync" containing the
	 * value of currentTime followed by a clickable "play-media" link. The link
	 * calls the BackToMedia command with the corpus name, the text name and
	 * the current time.
	 */
	private void writeCurrentTime() {
		// JavaScript call executed when the link is clicked
		String playCommand = "txmcommand('id', 'org.txm.backtomedia.commands.function.BackToMedia', 'corpus', '"+corpusname+"', 'text', '"+txtname+"', 'time', '"+currentTime+"')"

		writer.writeStartElement("span") // span@class=sync
		writer.writeAttribute("class", "sync")
		writer.writeCharacters(currentTime)

		writer.writeStartElement("a")
		writer.writeAttribute("onclick", playCommand)
		writer.writeAttribute("style", "cursor: pointer;")
		writer.writeAttribute("class", "play-media")
		writer.writeCharacters(" ♪♪")
		writer.writeEndElement() // a

		writer.writeEndElement() // span@class=sync
	}
713

  
779
	
714 780
	private void writeSpeaker(String spk, boolean overlapping) {
715

  
781
		
716 782
		writer.writeStartElement("span");
717 783
		writer.writeAttribute("class", "spk");
718 784
		if(interviewers.contains(spk))
......
721 787
			bold = false;
722 788
		spk = spk.replaceAll('^([^0-9]*)([0-9]+)$', '$1 $2');
723 789
		if (overlapping) writer.writeCharacters("// ")
790
		
724 791
		writer.writeCharacters(spk+": ")
792
		
725 793
		writer.writeEndElement(); // span@class=spk
726 794
	}
727

  
795
	
728 796
	/**
	 * Looks up an event description in eventTranslations.
	 *
	 * @param desc the event description read from the transcription
	 * @return the value registered for desc when it is a key of
	 *         eventTranslations, otherwise desc itself
	 */
	private String translateEvent(String desc) {
		return eventTranslations.containsKey(desc) ? eventTranslations.get(desc) : desc;
	}
734

  
802
	
735 803
	boolean boldOpenned = false;
736 804
	private void startBoldIfNeeded() {
737 805
		if (bold) {
......
739 807
			boldOpenned = true;
740 808
		}
741 809
	}
742

  
810
	
743 811
	private endBoldIfNeeded() {
744 812
		if (boldOpenned) {
745 813
			//			println "CLOSE BOLD"
......
747 815
			boldOpenned = false;
748 816
		}
749 817
	}
750

  
818
	
751 819
	//	private String formatTime(float time, boolean doshort)
752 820
	//	{
753 821
	//		String rez = " ";
......
787 855
	//		//		}
788 856
	//		return rez;
789 857
	//	}
790

  
858
	
791 859
	/**
792 860
	 * Gets the page files.
793 861
	 *
......
796 864
	public ArrayList<File> getPageFiles() {
797 865
		return pages;
798 866
	}
799

  
867
	
800 868
	/**
801 869
	 * Gets the idx.
802 870
	 *
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/AddSections.groovy (revision 2268)
10 10
File trsDirectory = new File("/home/mdecorde/TEMP/ANTRACT/AF/trs")
11 11
def idTRSColumn = "Lien notice principale"
12 12
def typeColumns = ["Identifiant de la notice"]
13
def topicColumns = ["Titre propre", "Genre", "antract_debut", "antract_fin"]
13
def topicColumns = ["Titre propre", "Notes du titre"]
14 14
def startTimeColumn = "antract_debut"
15 15
def endTimeColumn = "antract_fin"
16 16
//def metadataColumns = ["Identifiant de la notice", "Titre propre", "antract_debut", "antract_fin"]
......
21 21
	return
22 22
}
23 23

  
24

  
24
println "Loading data from $metadataFile..."
25 25
CsvReader reader = new CsvReader(metadataFile.getAbsolutePath(), "\t".charAt(0), Charset.forName("UTF-8"))
26 26
reader.readHeaders()
27 27
def header = reader.getHeaders()
......
80 80
	}
81 81

  
82 82
def sectionGroupsToInsert = [:]
83
println "Reading data..."
83 84
while (reader.readRecord()) {
84 85
	String id = reader.get(idTRSColumn).trim()
85 86
	if (id.endsWith(".mp4")) id = id.substring(0, id.length()-4)
......
95 96
		
96 97
		def m = [:]
97 98
		
98
		for (def todo : ["metadata":metadataColumns, "topic":topicColumns, "type":typeColumns]) {
99
		for (def todo : ["topic":topicColumns, "type":typeColumns]) {
99 100
			def data = []
100 101
			for (def col : todo.value) {
101 102
				if (reader.get(col).trim().length() > 0) {
......
104 105
			}
105 106
			m[todo.key] = data.join("\t")
106 107
		}
108
		for (def col : metadataColumns) {
109
			m[AsciiUtils.buildAttributeId(col)] = reader.get(col)
110
		}
107 111
		
108 112
		m["startTime"] = strTotime(reader.get(startTimeColumn))
109 113
		m["endTime"] = strTotime(reader.get(endTimeColumn))
......
113 117
	}
114 118
}
115 119

  
116
println "N sections: "+sectionGroupsToInsert.size()
120
println "Inserting sections... "+sectionGroupsToInsert.size()
117 121

  
118 122
ConsoleProgressBar cpb = new ConsoleProgressBar(sectionGroupsToInsert.keySet().size())
119 123
for (String id : sectionGroupsToInsert.keySet()) {
......
181 185
				
182 186
				currentSection = found
183 187
				currentNode = new Node(trsEpisode, "Section", currentSection[2])
184
				//trsEpisode.appendNode(currentNode)
185 188
			}
186 189
		}
187 190
		trsSection.remove(turn)
......
201 204
	}
202 205
}
203 206
cpb.done()
207
println "Done."
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/importer.groovy (revision 2268)
176 176
		//TOKENIZE
177 177
		println "Tokenizing "+files.length+" files from $txmDir"
178 178
		File tokenizedDir = new File(binDir, "tokenized")
179
		tokenizedDir.deleteDir()
179 180
		tokenizedDir.mkdir()
180 181
		cpb = new ConsoleProgressBar(files.length)
181 182
		for (File pfile : files) {
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/xtz/XTZDefaultPagerStep.groovy (revision 2268)
323 323

  
324 324
						rend = getAttributeValue(parser, null, "rend")
325 325
						if (rend == null) rend = "";
326
						
326 327
						switch (localname) {
327 328
							case "text":
328 329
								LinkedHashMap attributes = new LinkedHashMap();
tmp/org.txm.groovy.core/src/groovy/org/txm/macro/office/Table2CorpusMacro.groovy (revision 2268)
1
package org.txm.macro.csv
2

  
3
import org.kohsuke.args4j.*
4
import groovy.transform.Field
5
import java.nio.charset.Charset
6
import org.txm.rcp.swt.widget.parameters.*
7
import org.txm.utils.*
8
import javax.xml.stream.*
9
import java.net.URL
10

  
11
@Field @Option(name="inputFile", usage="CSV File", widget="File", required=false, def="file.xlsx")
12
		File inputFile;
13

  
14
@Field @Option(name="outputDirectory", usage="output directory", widget="File", required=false, def="directory")
15
		File outputDirectory;
16

  
17
@Field @Option(name="textSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sommaire")
18
		def textSelector;
19

  
20
@Field @Option(name="structureSelector", usage="column_to_test=regexp", widget="String", required=false, def="Type de notice=Notice sujet")
21
		def structureSelector;
22

  
23
@Field @Option(name="textIDColumn", usage="text id column", widget="String", required=false, def="Identifiant de la notice")
24
		def textIDColumn;
25

  
26
@Field @Option(name="joinColumn", usage="jointure column, values should point to the textIDColumn values", widget="String", required=false, def="Lien notice principale")
27
		def joinColumn;
28

  
29
@Field @Option(name="textMetadataColumnList", usage="text metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
30
		def textMetadataColumnList;
31

  
32
@Field @Option(name="textContentColumnList", usage="text content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
33
		def textContentColumnList;
34

  
35
@Field @Option(name="structureTag", usage="structure metadata columns", widget="String", required=false, def="div")
36
		def structureTag;
37

  
38
@Field @Option(name="structureMetadataColumnList", usage="structure metadata columns", widget="String", required=false, def="Identifiant de la notice,Date de diffusion,Durée,Genre,Identifiant Matériels (info.)")
39
		def structureMetadataColumnList;
40

  
41
@Field @Option(name="structureContentColumnList", usage="structure content columns", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
42
		def structureContentColumnList;
43

  
44
@Field @Option(name="metadataDateColumnList", usage="metadata columns of type=Date", widget="String", required=false, def="")
45
		def metadataDateColumnList;
46
@Field @Option(name="prefixContentColumnList", usage="metadata columns of type=Date", widget="String", required=false, def="")
47
		def prefixContentColumnList;
48
@Field @Option(name="listContentColumnList", usage="metadata columns of type=Date", widget="String", required=false, def="")
49
		def listContentColumnList;
50

  
51

  
52
//@Field @Option(name="structureOrderColumn", usage="structure column coding structure order", widget="String", required=false, def="Titre propre,Résumé,Descripteurs (Aff. Lig.),Descripteurs (Aff. Col.),Séquences")
53
//		def structureOrderColumn;
54

  
55
// Ask the user for the macro parameters; stop when the dialog is not validated
if (!ParametersDialog.open(this)) return;

// The column lists are typed as comma-separated names. Each name is trimmed
// once here so that "a, b" and "a,b" give the same column names everywhere
// the lists are used. The lists stay String arrays, as split() returns.
textMetadataColumnList = textMetadataColumnList.split(",")*.trim() as String[]
textContentColumnList = textContentColumnList.split(",")*.trim() as String[]
structureMetadataColumnList = structureMetadataColumnList.split(",")*.trim() as String[]
structureContentColumnList = structureContentColumnList.split(",")*.trim() as String[]
metadataDateColumnList = metadataDateColumnList.split(",")*.trim() as String[]
prefixContentColumnList = prefixContentColumnList.split(",")*.trim() as String[]
listContentColumnList = listContentColumnList.split(",")*.trim() as String[]

// A selector is written "column_to_test=regexp". Report a malformed selector
// instead of failing in substring() with a negative index.
for (def selector : ["textSelector":textSelector, "structureSelector":structureSelector]) {
	if (selector.value.indexOf("=") < 0) {
		println "Error: ${selector.key} must be written 'column_to_test=regexp': ${selector.value}"
		return
	}
}

// Split each selector at its first "=" into [column name, regular expression]
int ti = textSelector.indexOf("=")
String p1 = textSelector.substring(0, ti)
String p2 = textSelector.substring(ti+1)
textSelector = [p1, p2]
int si = structureSelector.indexOf("=")
structureSelector = [structureSelector.substring(0, si), structureSelector.substring(si+1)]
69

  
70

  
71
println "textIDColumn, joinColumn=$textIDColumn, $joinColumn"
72
println "textMetadataColumnList columns: $textMetadataColumnList"
73
println "textContentColumnList columns: $textContentColumnList"
74
println "structureMetadataColumnList columns: $structureMetadataColumnList"
75
println "structureContentColumnList columns: $structureContentColumnList"
76

  
77
println "text selector="+textSelector
78
println "structure selector="+structureSelector
79
println "structureTag="+structureTag
80

  
81
// Open the input table; nothing can be done without a header row
TableReader reader = new TableReader(inputFile);
if (!reader.readHeaders()) {
	println "Error: no header"
	return
}
def headers = Arrays.asList(reader.getHeaders())
println "table columns: $headers"

// Every configured column must be present in the table header. All the
// missing columns are reported before the macro stops.
def ok = true
def columnsToCheck = [
	"selection":[textIDColumn, joinColumn],
	"textMetadataColumnList":textMetadataColumnList,
	"textContentColumnList":textContentColumnList,
	"structureMetadataColumnList":structureMetadataColumnList,
	"structureContentColumnList":structureContentColumnList]
for (def entry : columnsToCheck.entrySet()) {
	for (String column : entry.value) {
		String trimmed = column.trim()
		if (headers.contains(trimmed)) continue

		println "Error: missing ${entry.key} column: $trimmed"
		ok = false
	}
}
if (!ok) return
102

  
103
// Group the records by text: texts maps a text id to a list of records.
// - a record matching the text selector is inserted at the HEAD of its group (index 0)
// - a record matching the structure selector is appended to the group named by its join column
// - any other record is ignored
// LinkedHashMap keeps the groups in order of first appearance in the table.
def texts = new LinkedHashMap()
def nRecord = 0
while (reader.readRecord()) {
	nRecord++

	// null-safe trim: the null checks below are then meaningful instead of being
	// preceded by a NullPointerException
	String textSelectorValue = reader.get(textSelector[0])?.trim()
	String structureSelectorValue = reader.get(structureSelector[0])?.trim()

	if (textSelectorValue != null && textSelectorValue.matches(textSelector[1])) {
		String id = reader.get(textIDColumn).trim()
		if (!texts.containsKey(id)) texts[id] = []
		texts[id].add(0, reader.getRecord())
	} else if (structureSelectorValue != null && structureSelectorValue.matches(structureSelector[1])) {
		String join = reader.get(joinColumn).trim()
		if (!texts.containsKey(join)) texts[join] = []
		texts[join].add(reader.getRecord())
	} else {
		// neither a text line nor a structure line: ignore
	}
}
println "N lines: "+nRecord
println "N groups: "+texts.size()
if (texts.size()  == 0) {
	println "No text found. Aborting."
	reader.close() // do not leak the reader when aborting
	return
}
130
// Create the output directory, including any missing parent directory.
outputDirectory.mkdirs()

// Write one XML-TEI file per text group: the text record first (attributes, metadata list,
// content paragraphs), then one <structureTag> element per structure record of the group.
XMLOutputFactory factory = XMLOutputFactory.newInstance() // reused for every text
try {
	for (def id : texts.keySet()) {
		def toWrite = texts[id]
		def text = toWrite[0]

		// Text records are inserted at index 0 when grouping: if the first record does not
		// match the text selector, the group only holds structure lines without their text.
		String textSelectorValue = text.get(textSelector[0])?.trim()
		if (textSelectorValue == null || !textSelectorValue.matches(textSelector[1])) {
			println "ERROR: '$id' text group with  no text line"
			continue
		}
		println "Processing text: $id"

		File outputfile = new File(outputDirectory, id+".xml")
		FileOutputStream output = new FileOutputStream(outputfile)
		XMLStreamWriter writer = null
		try {
			writer = factory.createXMLStreamWriter(output, "UTF-8")

			writer.writeStartDocument("UTF-8","1.0")
			writer.writeCharacters("\n") // simple XML formatting
			writer.writeStartElement("TEI")
			writer.writeCharacters("\n")
			writer.writeStartElement("teiHeader")
			writer.writeEndElement() // teiHeader
			writer.writeCharacters("\n")
			writer.writeStartElement("text")
			writer.writeAttribute("id", id)

			// text metadata as attributes of <text>; date columns are expanded by writeMetadataDate
			for (String att : textMetadataColumnList) {
				if (att in metadataDateColumnList) {
					writeMetadataDate(text, att, writer)
				} else {
					writer.writeAttribute(AsciiUtils.buildAttributeId(att), text.get(att))
				}
			}
			writer.writeCharacters("\n")

			// the same text metadata again, as a list inside the text body
			writer.writeStartElement("metadata")
			writer.writeStartElement("list")
			writer.writeAttribute("type", "unordered")
			writer.writeCharacters("\n")
			for (String att : textMetadataColumnList) {
				writer.writeStartElement("item")
				writer.writeCharacters(att+" : "+text.get(att).replaceAll("\n", ";"))
				writer.writeEndElement() // item
				writer.writeCharacters("\n")
			}
			writer.writeEndElement() // list
			writer.writeEndElement() // metadata
			writer.writeCharacters("\n")

			// text content columns
			for (String att : textContentColumnList) {
				if (att in prefixContentColumnList) {
					writeContentPrefix(text, att, writer)
				} else if (att in listContentColumnList) {
					writeContentList(text, att, writer)
				} else {
					writer.writeStartElement("p");
					writer.writeAttribute("type", att.trim())
					writer.writeCharacters("\n")

					writer.writeStartElement("head")
					writer.writeStartElement("hi")
					writer.writeCharacters(att+" : ")
					writer.writeEndElement() // hi
					writer.writeEndElement() // head

					writer.writeCharacters(text.get(att)) // content of the text column
					writer.writeEndElement() // p
					writer.writeCharacters("\n")
				}
			}

			// structure records of the group: index 0 is the text record itself
			int pb_n = 1;
			for (int i = 1 ; i < toWrite.size() ; i++) {
				def record = toWrite[i]

				writer.writeEmptyElement("pb") // <pb/> : one page break before each structure
				writer.writeAttribute("n", ""+pb_n++)

				writer.writeStartElement(structureTag)

				// structure metadata as attributes of the structure element
				for (String att : structureMetadataColumnList) {
					if (att in metadataDateColumnList) {
						writeMetadataDate(record, att, writer)
					} else {
						writer.writeAttribute(AsciiUtils.buildAttributeId(att), record.get(att))
					}
				}
				writer.writeCharacters("\n")

				// the same structure metadata again, as a list inside the structure
				writer.writeStartElement("metadata")
				writer.writeStartElement("list")
				writer.writeAttribute("type", "unordered")
				writer.writeCharacters("\n")
				for (String att : structureMetadataColumnList) {
					writer.writeStartElement("item")
					writer.writeCharacters(att+" : "+record.get(att).replaceAll("\n", ";"))
					writer.writeEndElement() // item
					writer.writeCharacters("\n")
				}
				writer.writeEndElement() // list
				writer.writeEndElement() // metadata
				writer.writeCharacters("\n")

				// structure content columns: read from the structure record, NOT from the
				// text record (the prefix/list branches previously received 'text')
				for (String att : structureContentColumnList) {
					if (att in prefixContentColumnList) {
						writeContentPrefix(record, att, writer)
					} else if (att in listContentColumnList) {
						writeContentList(record, att, writer)
					} else {
						writer.writeStartElement("p");
						writer.writeAttribute("type", att.trim())
						writer.writeAttribute("id", record.get(textIDColumn))
						writer.writeCharacters("\n")

						writer.writeStartElement("head")
						writer.writeStartElement("hi")
						writer.writeCharacters(att+" : ")
						writer.writeEndElement() // hi
						writer.writeEndElement() // head

						writer.writeCharacters(record.get(att)) // content of the structure column
						writer.writeEndElement() // p
						writer.writeCharacters("\n")
					}
				}

				writer.writeEndElement() // structure element
				writer.writeCharacters("\n")
			}

			writer.writeEndElement() // text
			writer.writeCharacters("\n")
			writer.writeEndElement() // TEI
		} finally {
			// always release the file, even when a record cannot be written
			try {
				if (writer != null) writer.close()
			} finally {
				output.close()
			}
		}
	}
} finally {
	// close the table reader once, after all texts are written (not once per text)
	reader.close()
}
270

  
271
/**
 * Writes the content of a column as a list of paragraphs.
 *
 * When the value contains line breaks, each non-empty line is written as a
 * {@code <p rend="list">} element, including the last line even if it is not
 * terminated by a line break. Otherwise the whole value is written in a single
 * {@code <p rend="no-list">} element.
 *
 * @param record the record to read; only {@code record.get(att)} is called
 * @param att the name of the column to write
 * @param writer the XML stream writer receiving the elements
 */
def writeContentList(def record, def att, def writer) {
	writer.writeCharacters("\n")
	def value = record.get(att)

	// non-empty lines of the value; a value without any line break is not a list
	def items = value.contains("\n") ? value.split("\n").findAll { it.length() > 0 } : []

	if (items.isEmpty()) {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "no-list")
		writer.writeCharacters(value)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
		return
	}

	for (String item : items) {
		writer.writeCharacters("\t")
		writer.writeStartElement("p")
		writer.writeAttribute("rend", "list")
		writer.writeCharacters(item)
		writer.writeEndElement() // p
		writer.writeCharacters("\n")
	}
}
294

  
295
/**
 * Writes the content of a column as a {@code <list rend="prefixes" type="unordered">}.
 *
 * The value is scanned for entries made of a 3 uppercase letter prefix, an optional
 * colon, a description, at least one space, then ';' or a line break. Each entry is
 * written as an {@code <item type="PREFIX">} holding the prefix in a {@code <span>}
 * followed by the description. When no entry is found, the raw value is written as
 * the text of the list element.
 *
 * @param record the record to read; only {@code record.get(att)} is called
 * @param att the name of the column to write
 * @param writer the XML stream writer receiving the elements
 */
def writeContentPrefix(def record, def att, def writer) {
	writer.writeCharacters("\n")
	writer.writeStartElement("list")
	writer.writeAttribute("rend", "prefixes")
	writer.writeAttribute("type", "unordered")
	writer.writeCharacters("\n")

	def value = record.get(att)
	// group 1 = prefix, group 2 = description; a single scan replaces the
	// former findAll + re-match of every entry
	def entries = (value =~ /(?s)([A-Z]{3}):? *([^;\n]+?) +[;\n]/)
	boolean found = false // local: no longer leaks into the script binding
	while (entries.find()) {
		found = true
		writer.writeCharacters("\t")
		writer.writeStartElement("item")
		writer.writeAttribute("type", entries.group(1))
		writer.writeStartElement("span")
		writer.writeCharacters(entries.group(1)+" ")
		writer.writeEndElement() // span
		writer.writeCharacters(entries.group(2))
		writer.writeEndElement() // item
		writer.writeCharacters("\n")
	}
	if (!found) {
		writer.writeCharacters(value)
	}
	writer.writeEndElement() // list
	writer.writeCharacters("\n")
}
322

  
323
/**
 * Writes a date column as several attributes of the current element.
 *
 * The first dd/MM/yyyy date found in the value is written as the attributes
 * {@code <att>jour} (day), {@code <att>joursemaine} (name of the day of the week,
 * formatted with the default locale), {@code <att>mois} (month), {@code <att>annee}
 * (year) and {@code <att>tri} (yyyy-MM-dd sort key), where {@code <att>} is the
 * column name normalized by AsciiUtils.buildAttributeId.
 *
 * When the value contains no dd/MM/yyyy date, a warning is printed and no attribute
 * is written, instead of aborting the whole conversion with an exception.
 *
 * @param record the record to read; only {@code record.get(att)} is called
 * @param att the name of the date column
 * @param writer the XML stream writer; an element must be open and accept attributes
 */
def writeMetadataDate(def record, def att, def writer) {
	String value = record.get(att)
	String att_normalized = AsciiUtils.buildAttributeId(att)

	def matches = (value =~ /([0-9]{2})\/([0-9]{2})\/([0-9]{4})/)
	if (value == null || !matches.find()) {
		println "Warning: no dd/MM/yyyy date found in '$att' column: $value"
		return
	}
	String day = matches.group(1)
	String month = matches.group(2)
	String year = matches.group(3)

	// parse the matched date only: the raw value may hold text before or after the date
	Date date = Date.parse("dd/MM/yyyy", matches.group(0))

	writer.writeAttribute(att_normalized+"jour", day)
	writer.writeAttribute(att_normalized+"joursemaine", new java.text.SimpleDateFormat('EEEE').format(date))
	writer.writeAttribute(att_normalized+"mois", month)
	writer.writeAttribute(att_normalized+"annee", year)
	writer.writeAttribute(att_normalized+"tri", year+"-"+month+"-"+day)
}
333

  
334
/*
335
 String name = inputFile.getName()
336
 int idx = name.lastIndexOf(".")
337
 if (idx > 0) name = name.substring(0, idx)
338
 */

Formats disponibles : Unified diff