Révision 2999

tmp/org.txm.groovy.core/src/groovy/org/txm/macro/transcription/Vocapia2Transcriber.groovy (revision 2999)
37 37
		
38 38
		boolean flagWord = false
39 39
		def winfos = [:]
40
		def turninfos = [:]
40 41
		boolean other = false;
41 42
		String word = ""
42 43
		try {
......
92 93
							
93 94
							case "SpeechSegment": // <SpeechSegment ch="1" sconf="1.00" stime="9.94" etime="43.81" spkid="Enquêtrice" lang="fre" lconf="1.00" trs="1">
94 95
								writer.writeStartElement("Turn")
95
								writer.writeAttribute("speaker", parser.getAttributeValue(null, "spkid"))
96
								writer.writeAttribute("startTime", parser.getAttributeValue(null, "stime"))
97
								writer.writeAttribute("endTime", parser.getAttributeValue(null, "etime"))
96
								
97
								turninfos = ["speaker":parser.getAttributeValue(null, "spkid"),
98
									"startTime":parser.getAttributeValue(null, "stime"),
99
									"endTime":parser.getAttributeValue(null, "etime"),
100
								]
101
								for (String attr : turninfos.keySet()) {
102
									writer.writeAttribute(attr, turninfos[attr])
103
								}
104
								
98 105
								writer.writeCharacters("\n")
99 106
								writer.writeStartElement("Sync")
100 107
								writer.writeAttribute("time", parser.getAttributeValue(null, "stime"))
......
148 155
								flagWord = false
149 156
								word = word.trim()
150 157
								if (word.startsWith("*")) {
158
									//close current Turn and start a 'other' Turn
159
									writer.writeEndElement() // current Turn
160
									writer.writeStartElement("Turn")
161
									writer.writeAttribute("speaker", "other")
162
									writer.writeAttribute("startTime", winfos["time"])
163
									writer.writeAttribute("orig-speaker", turninfos["speaker"])
164
									writer.writeCharacters("\n")
151 165
									other = true
152 166
									word = word.substring(1)
153 167
								}
......
155 169
								String otherAttributeValue = Boolean.toString(other) // set now
156 170
							
157 171
								if (other && word.endsWith("*")) {
172
									
173
									//close the current 'other' Turn and restart the actual Turn
174
									writer.writeEndElement() // current 'other' Turn
175
									
176
									writer.writeStartElement("Turn") // rebuild the orig Turn with its infos
177
									turninfos["startTime"] = winfos["end"] // fix the startTime using the current word end time
178
									for (String attr : turninfos.keySet()) {
179
										writer.writeAttribute(attr, turninfos[attr])
180
									}
181
									writer.writeCharacters("\n")
182
									
158 183
									word = word.substring(0, word.length()-1)
159 184
									other = false
160 185
								}
......
178 203
								
179 204
								for (def punct : puncts) { // pre-retokenize if any
180 205
									writer.writeStartElement("w")
181
									writer.writeAttribute("time", winfos["time"])
182
									writer.writeAttribute("start", winfos["start"])
183
									writer.writeAttribute("end", winfos["start"])
206
									for (String attr : winfos.keySet()) {
207
										writer.writeAttribute(attr, winfos[attr])
208
									}
184 209
									writer.writeCharacters(punct)
185 210
									writer.writeEndElement() // w
186 211
									writer.writeCharacters("\n")
187
									
188
									
189 212
								}
190 213
							
191 214
								puncts = []
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/pager.groovy (revision 2999)
35 35
import org.txm.importer.ApplyXsl2
36 36
import org.txm.metadatas.MetadataGroup
37 37
import org.txm.metadatas.Metadatas
38
import org.txm.utils.TimeFormatter
38 39
import org.txm.utils.io.FileCopy;
39 40

  
40 41

  
......
55 56
	List<String> NoSpaceAfter;
56 57
	
57 58
	/** The pages. */
58
	def pages = [];
59
	def indexes = [];
59
	def pages = []
60
	def indexes = []
60 61
	
61 62
	/** The wordcount. */
62
	int wordcount = 0;
63
	int wordcount = 0
63 64
	
64 65
	/** The pagecount. */
65
	int pagecount = 0;
66
	int pagecount = 0
66 67
	
67 68
	/** The wordmax. */
68
	int wordmax = 10;
69
	int wordmax = 10
69 70
	
70 71
	/** The wordid. */
71 72
	String wordid;
72 73
	
73 74
	/** The first word. */
74
	boolean firstWord = true;
75
	boolean firstWord = true
75 76
	
76 77
	/** The wordvalue. */
77 78
	String wordvalue;
......
113 114
	
114 115
	File outfile;
115 116
	
116
	String corpusname ="";
117
	String corpusname =""
117 118
	String cuttingTag = "pb"
118
	String txtname;
119
	File htmlDir;
120
	File defaultDir;
121
	Metadatas metadatas;
119
	String txtname
120
	File htmlDir
121
	File defaultDir
122
	Metadatas metadatas
122 123
	
123
	def interviewers = [];
124
	def interviewers = null
124 125
	def eventTranslations = ["^^":"mot inconnu", "?":"orthographe incertaine",
125 126
		"()":"rupture de syntaxe", "b":"bruit indéterminé",
126 127
		"*":"mot corrigé",
......
133 134
		"pif":"inaudible", "r":"respiration",
134 135
		"rire":"rire du locuteur", "shh":"soufle électrique",
135 136
		"sif":"sifflement du locuteur", "tx":"toux"];
136
	String currentTime = "";
137
	boolean bold = false;
138
	int writenLength = 0;
139
	boolean spokenTurn = false;
140
	boolean firstSync = false;
141
	boolean firstWho = false;
137
	String currentUTime = ""
138
	String startTimeSp = ""
139
	String endTimeSp = ""
140
	String startTimeU = "0"
141
	String previousStartTimeU = "0"
142
	boolean bold = false
143
	int writenLength = 0
144
	boolean spokenTurn = false
145
	boolean firstSync = false
146
	boolean firstWho = false
147
	int nSilence = 0
142 148
	/**
143 149
	 * Instantiates a new pager.
144 150
	 *
......
307 313
									writer.writeEndElement(); // td
308 314
								}
309 315
								//get enqueteur to style their names
310
								if (name.startsWith("enq")) {
311
									interviewers.add(value)
316
								if (name.equals("out-of-text-to-edit-locutor")) {
317
									interviewers = /$value/
312 318
								}
313 319
								writer.writeEndElement(); // tr
314 320
							}
......
493 499
							firstWho = true;
494 500
							spokenTurn = false;
495 501
							overlapping = false
502
							nSilence = 0 // will count the number of silence written to avoid writting [silence] at the sp end
496 503
						
497 504
							writer.writeStartElement("p");
498 505
							writer.writeAttribute("class", "turn");
506
							writer.writeCharacters("\n");
499 507
						
508
							this.startTimeSp = parser.getAttributeValue(null, "start")
509
							this.endTimeSp = parser.getAttributeValue(null, "end")
510
						
500 511
							overlapping = ("true" == parser.getAttributeValue(null,"overlap"))
501 512
							String spid = parser.getAttributeValue(null, "who");
502 513
						
......
512 523
							break;
513 524
						case "u":
514 525
							writer.writeCharacters("\n");
515
							this.currentTime = parser.getAttributeValue(null,"time");
526
							this.previousStartTimeU = this.startTimeU
527
							this.startTimeU = parser.getAttributeValue(null, "start");
528
							this.currentUTime = parser.getAttributeValue(null, "time");
516 529
						
517 530
							if (previousElem == "u" && writenLength == 0) { // if previous u had no words, it was a silence
531
								def duration = ""
532
								try {
533
									def d = Float.parseFloat(this.startTimeU) - Float.parseFloat(this.previousStartTimeU)
534
									duration = " "+TimeFormatter.formatTime(d);
535
								} catch(Exception e) { e.printStackTrace()}
518 536
								writer.writeStartElement("span");
519 537
								writer.writeAttribute("class", "event");
520
								writer.writeCharacters("[silence]");
538
								writer.writeCharacters("[silence$duration]");
521 539
								writer.writeEndElement(); // span
522
								writer.writeEmptyElement("br");
540
								nSilence++
541
								//writer.writeEmptyElement("br");
523 542
							}
524 543
						
525 544
							String spk = parser.getAttributeValue(null, "who")
......
534 553
							previousSPK = spk
535 554
							if (overlapping) previousSPK = null
536 555
						
556
							writenLength = 0;
537 557
						//							writenLength = 0;
538 558
						/*writer.writeStartElement("span");
539 559
					 writer.writeAttribute("class", "sync");
......
652 672
						case "sp":
653 673
						//println "CLOSING: "+parser.getLocalName()
654 674
							endBoldIfNeeded()
655
							if (!spokenTurn) {
675
							if (!spokenTurn && nSilence == 0) {
656 676
								writer.writeStartElement("span");
657 677
								writer.writeAttribute("class", "event");
658
								writer.writeCharacters("[silence]");
678
								String duration = ""
679
								try {
680
									def d = Float.parseFloat(endTimeSp)-Float.parseFloat(startTimeSp)
681
									duration = " "+TimeFormatter.formatTime(d);
682
								} catch (Exception e) {e.printStackTrace()}
683
								writer.writeCharacters("[silence$duration]");
659 684
								writer.writeEndElement();
660 685
								writer.writeEmptyElement("br");
661 686
							}
......
680 705
						//writer.writeEndElement() // span@class=u
681 706
						//writer.writeEmptyElement("br");
682 707
						//if (overlapping) writer.writeEndElement(); // b
708
							writer.writeCharacters("\n");
683 709
							break;
684 710
						case "event":
685 711
							break;
......
694 720
							spokenTurn = true;
695 721
							int l = lastword.length();
696 722
							String endOfLastWord = "";
697
							if(l > 0)
723
							if (l > 0) {
698 724
								endOfLastWord = lastword.subSequence(l-1, l);
699
						
725
							}
700 726
							if (interpvalue != null) {
701 727
								interpvalue = interpvalue.replace("\"","&quot;");
702 728
							}
......
726 752
						
727 753
							if (interpvalue.contains("rapp1")) {
728 754
								writer.writeCharacters(" «");
729
							} else if (wordvalue == "\"") {
730
								// don't write this char
731
							} else {
732
								writer.writeStartElement("span");
733
								writer.writeAttribute("class", "word");
734
								writer.writeAttribute("title", interpvalue);
735
								writer.writeAttribute("id", wordid);
736
								writer.writeCharacters(wordvalue);
737
								writer.writeEndElement();
738
							}
755
							} 
756
						
757
							writer.writeStartElement("span");
758
							writer.writeAttribute("class", "word");
759
							writer.writeAttribute("title", interpvalue);
760
							writer.writeAttribute("id", wordid);
761
							writer.writeCharacters(wordvalue);
762
							writer.writeEndElement();
763
						
739 764
							if (interpvalue.contains("orth")) {
740 765
								writer.writeStartElement("span");
741 766
								writer.writeAttribute("class", "event");
......
748 773
								writer.writeCharacters("_[!]");
749 774
								writer.writeEndElement();
750 775
							}
751
						
752 776
							if (interpvalue.contains("rapp2")) {
753
								writer.writeCharacters(" » ");
777
								writer.writeCharacters("» ");
754 778
							}
755 779
						
756 780
							lastword=wordvalue;
......
817 841
	private void writeCurrentTime() {
818 842
		writer.writeStartElement("span");
819 843
		writer.writeAttribute("class", "sync");
820
		writer.writeCharacters(currentTime);
844
		writer.writeCharacters(currentUTime);
821 845
		
822
		writeMediaAccess(currentTime)
846
		writeMediaAccess(currentUTime)
823 847
		
824 848
		writer.writeEndElement() // span
825 849
	}
......
838 862
		
839 863
		writer.writeStartElement("span");
840 864
		writer.writeAttribute("class", "spk");
841
		if(interviewers.contains(spk)) {
842
			bold = true;
843
		} else {
844
			bold = false;
845
		}
865
		bold = interviewers != null && interviewers.matches(spk)
846 866
		spk = spk.replaceAll('^([^0-9]*)([0-9]+)$', '$1 $2');
847 867
		if (overlapping) {
848 868
			writer.writeCharacters("// ")
......
876 896
		}
877 897
	}
878 898
	
879
	//	private String formatTime(float time, boolean doshort)
880
	//	{
881
	//		String rez = " ";
882
	//		//		if(time >= 3600) // >= 1h
883
	//		//		{
884
	//		float h = time / 3600;
885
	//		time = time%3600;
886
	//		float min = (time%3600) / 60;
887
	//		int sec = (int)time%60;
888
	//
889
	//		if(min < 10)
890
	//			rez = ""+(int)h+":0"+(int)min;//+":"+time%60;
891
	//		else
892
	//			rez = ""+(int)h+":"+(int)min;//+":"+time%60;
893
	//		//if (!doshort)
894
	//		if (sec > 9)
895
	//			rez += ":"+(int)time%60;
896
	//		else
897
	//			rez += ":0"+(int)time%60;
898
	//		//		}
899
	//		//		else if(time >= 60) // >= 1min
900
	//		//		{
901
	//		//			int min = time/60;
902
	//		//			if(min < 10)
903
	//		//				rez = "00:0"+min;//+":"+time%60;
904
	//		//			else
905
	//		//				rez = "00:"+min;//+":"+time%60;
906
	//		//			if(!doshort)
907
	//		//				rez += ":"+(int)time%60;
908
	//		//		}
909
	//		//		else // < 60
910
	//		//		{
911
	//		//			if(time < 10)
912
	//		//				return " 0:0"+time;
913
	//		//			else
914
	//		//				return " 0:"+time;
915
	//		//		}
916
	//		return rez;
917
	//	}
918
	
919 899
	/**
920 900
	 * Gets the page files.
921 901
	 *
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/transcriberLoader.groovy (revision 2999)
225 225
					ArrayList<Pair<String, String>> metas = metadatas.get(filename)
226 226
					//println "filename=$filename metas= $metas"
227 227
					for (Pair p : metas) {
228
						if (p.getFirst().startsWith("enq")) {
229
							new RemoveSpeaker(infile, infile, p.getFirst())
228
						if (p.getFirst().startsWith("out-of-text-to-edit-locutor")) {
229
							new RemoveSpeaker(infile, infile, p.getSecond())
230 230
						}
231 231
					}
232 232
				}
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/RemoveSpeaker.groovy (revision 2999)
31 31
import org.txm.scripts.importer.*;
32 32
import org.txm.scripts.importer.graal.PersonalNamespaceContext
33 33
import org.txm.utils.*;
34
import org.txm.utils.xml.DomUtils
34 35
import org.txm.metadatas.*;
35 36

  
36 37
import java.io.File;
......
65 66
	 * @param outfile the outfile
66 67
	 * @param id the id
67 68
	 */
68
	public RemoveSpeaker(File transcriptionfile, File outfile, String id)
69
	{
69
	public RemoveSpeaker(File transcriptionfile, File outfile, String idRegex) {
70 70
		System.setProperty("javax.xml.transform.TransformerFactory", "net.sf.saxon.TransformerFactoryImpl");
71 71
				
72 72
		this.outfile = outfile;
73
		String xpathString = "//tei:u[@spk='"+id+"']";
73
//		String xpathString = "//u";
74 74
		//println "removing $xpathString in $transcriptionfile"
75
		DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
76
		//println "domFactory: $domFactory"
77
		domFactory.setNamespaceAware(true); // never forget this!
78
		domFactory.setXIncludeAware(true);
79
		DocumentBuilder builder = domFactory.newDocumentBuilder();
75
//		DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance();
76
//		//println "domFactory: $domFactory"
77
//		domFactory.setNamespaceAware(true); // never forget this!
78
//		domFactory.setXIncludeAware(true);
79
//		DocumentBuilder builder = domFactory.newDocumentBuilder();
80 80
		//println "builder $builder"
81
		doc = builder.parse(transcriptionfile);
81
		//doc = builder.parse(transcriptionfile);
82 82
		//println "doc $doc"
83
		doc = DomUtils.load(transcriptionfile)
84
		//def xpath = XPathFactory.newInstance().newXPath()
85
//		xpath.setNamespaceContext(new PersonalNamespaceContext());
86
		//def expr = xpath.compile(xpathString);
87
		def nodes = doc.getDocumentElement().getElementsByTagName("u")
83 88
		
84
		def xpath = XPathFactory.newInstance().newXPath()
85
		xpath.setNamespaceContext(new PersonalNamespaceContext());
86
		def expr = xpath.compile(xpathString);
87
		def nodes = expr.evaluate(doc, XPathConstants.NODESET);
89
		def reg = /$idRegex/
88 90
		
89 91
		for (def node : nodes) {
90
			//println "remove node "+node
91
			Element elem = (Element)node;
92
			elem.getParentNode().removeChild(node);
92
			
93
			//Element elem = (Element)node;
94
			
95
			String who = node.getAttribute("who") // [@who='"+idRegex+"']
96
			if (reg.matches(who)) {
97
				node.getParentNode().removeChild(node);
98
			}
93 99
		}
94 100
		save()
95 101
	}
......
99 105
	 *
100 106
	 * @return true, if successful
101 107
	 */
102
	private boolean save()
103
	{
108
	private boolean save() {
104 109
		try {
105 110
			// Création de la source DOM
106 111
			Source source = new DOMSource(doc);
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/TranscriberTokenizer.groovy (revision 2999)
70 70
	 * @param infile the infile
71 71
	 * @param outfile the outfile
72 72
	 */
73
	public TranscriberTokenizer(File infile, File outfile, String lang)
74
	{
73
	public TranscriberTokenizer(File infile, File outfile, String lang) {
75 74
		super(infile, outfile, lang)
76 75
		txtname = infile.getName();
77 76
		int idx = txtname.lastIndexOf(".")
......
96 95
				audio = "present"
97 96
			notation = s;
98 97
			event = "";
99
			
100
			if (s.startsWith("\"")) {
101
				rapp = true;
102
				event += "#rapp1";
103
			} else if(s.endsWith("\"")) {
104
				rapp = false;
105
				event += "#rapp2";
106
			}
107 98

  
99
			//TODO does not work (eg ' "word" '). This step should be done after the tokenizer step is done			
100
//			if (s.startsWith("\"") && s.endsWith("\"")) {
101
//				// not rapp1 or rapp2
102
//			} else if (s.startsWith("\"")) {
103
//				rapp = true;
104
//				event += "#rapp1";
105
//			} else if(s.endsWith("\"")) {
106
//				rapp = false;
107
//				event += "#rapp2";
108
//			}
109

  
108 110
			//test events
109 111
			if (s.startsWith("^^")) {
110 112
				event += "#orth";			
tmp/org.txm.groovy.core/src/groovy/org/txm/scripts/importer/transcriber/compiler.groovy (revision 2999)
90 90

  
91 91

  
92 92
	/** The interviewers. */
93
	ArrayList<String> interviewers = [];
93
	def interviewers = null
94 94
	static HashSet<String> sectionAttrs;
95 95

  
96 96
	/** The anatypes. */
......
481 481
							break;
482 482
						case "w":
483 483
						// concat spk id and ref
484
							String isEnq = (interviewers.contains(u_name))?"*":"";
484
							String isEnq = (interviewers != null && interviewers.matches(u_name))?"*":"";
485 485
							String ref = (u_name+", "+formatedTime+""+isEnq)
486 486
							if (ADD_TEXTID_TO_REF) ref = textid+", "+ref
487 487
							vForm +="\t"+u_name+"\t"+ref
......
508 508
								}
509 509
							}
510 510

  
511

  
512 511
							vForm = vForm.replaceAll("\n", "").replaceAll("&", "&amp;").replaceAll("<", "&lt;");
513 512

  
514 513
							if (removeinterviewers) {
515
								if (!interviewers.contains(u_name))
514
								if (!interviewers.matches(u_name))
516 515
									output.write(vForm+"\t"+wordid+vAna+"\n");
517 516
							} else {
518 517
								output.write(vForm+"\t"+wordid+vAna+"\n");
......
660 659

  
661 660
						for (int i = 0 ; i < parser.getAttributeCount() ; i ++) {
662 661
							list.add(new Pair(parser.getAttributeLocalName(i).replace("_","").toLowerCase(), parser.getAttributeValue(i)));
663
							if (parser.getAttributeLocalName(i).startsWith("enq"))
664
								interviewers.add(parser.getAttributeValue(i));
662
							if (parser.getAttributeLocalName(i).equals("out-of-text-to-edit-locutor"))
663
								interviewers = /${parser.getAttributeValue(i)}/;
665 664
						}
666 665
						return
667 666
					case "Topic":

Formats disponibles : Unified diff