Révision 7e2d37e1 src/R/nucleominer.R

b/src/R/nucleominer.R
106 106

  
107 107

  
108 108

  
109
lod_score_vecs = structure(function # Likelihood ratio
110
### Compute the likelihood log of two set of value from two models Vs. a unique model.
109
llr_score_nvecs = structure(function # Likelihood ratio
110
### Compute the log likelihood ratio of two or more set of value.
111 111
(
112
x ,##<< First vector.
113
y ##<< Second vector.
112
  xs ##<< list of vectors.
114 113
) {
115
	if (length(x) <=1 | length(y) <= 1) {
116
		return(NA)
117
	}
118
  meanX = mean(x)
119
  sdX = sd(x)
120
  meanY = mean(y)
121
  sdY = sd(y)
122
  meanXY = mean(c(x,y))
123
  sdXY = sd(c(x,y))
124
  llX = sum(log(dnorm(x,mean=meanX,sd=sdX)))
125
  llY = sum(log(dnorm(y,mean=meanY,sd=sdY)))
126
  llXY = sum(log(dnorm(c(x,y),mean=meanXY,sd=sdXY)))
127
  ratio = llX + llY - llXY
114
  l = length(xs)
115
  if (l < 1) {
116
    return(NA)
117
  }
118
  if (l == 1) {
119
    return(1)
120
  }
121
  sumllX = 0
122
  for (i in 1:l) {
123
    x = xs[[i]]
124
  	if (length(x) <= 1) {
125
  		return(NA)
126
  	}
127
    meanX = mean(x)
128
    sdX = sd(x)
129
    llX = sum(log(dnorm(x,mean=meanX,sd=sdX)))
130
    sumllX = sumllX + llX
131
  }
132
  meanXglo = mean(unlist(xs))
133
  sdXglo = sd(unlist(xs))
134
  llXYZ = sum(log(dnorm(unlist(xs),mean=meanXglo,sd=sdXglo)))
135
  ratio = sumllX - llXYZ
128 136
  return(ratio)
129
### Returns the likelihood ratio.
137
### Returns the log likelihood ratio.
130 138
}, ex=function(){
131 139
  # LOD score for 2 set of values
132 140
  mean1=5; sd1=2; card2 = 250
......
139 147
  lines(min:max,dnorm(min:max,mean1,sd1)*card1,col=2)
140 148
  lines(min:max,dnorm(min:max,mean2,sd2)*card2,col=3)
141 149
  lines(min:max,dnorm(min:max,mean(c(x1,x2)),sd(c(x1,x2)))*card2,col=4)
142
  lod_score_vecs(x1,x2)
150
  llr_score_nvecs(list(x1,x2))
143 151
 })
144 152

  
145 153
dfadd = structure(function# Adding list to a dataframe.
......
207 215
  }
208 216
  tf_outputs$lower_bound = tf_outputs$center - tf_outputs$width/2
209 217
  tf_outputs$upper_bound = tf_outputs$center + tf_outputs$width/2
210
  tf_outputs = tf_outputs[tf_outputs$correlation >= corr_thres,]
211
  tf_outputs = tf_outputs[order(tf_outputs$correlation,decreasing=TRUE),]
218
  tf_outputs = tf_outputs[tf_outputs$correlation.score >= corr_thres,]
219
  tf_outputs = tf_outputs[order(tf_outputs$correlation.score, decreasing=TRUE),]
212 220
  i = 1
213 221
  while (i <= length(tf_outputs[,1])) {
214
    lb = tf_outputs[i,]$low
215
    ub = tf_outputs[i,]$up
216
    tf_outputs = tf_outputs[!(tf_outputs$low <= (ub-ol_bp) & tf_outputs$up > ub) & !(tf_outputs$up >= (lb+ol_bp) & tf_outputs$low < lb),]
222
    lb = tf_outputs[i,]$lower_bound
223
    ub = tf_outputs[i,]$upper_bound
224
    tf_outputs = tf_outputs[!(tf_outputs$lower_bound <= (ub-ol_bp) & tf_outputs$upper_bound > ub) & !(tf_outputs$upper_bound >= (lb+ol_bp) & tf_outputs$lower_bound < lb),]
217 225
    i = i+1
218 226
  }
219 227
  return(tf_outputs)
......
265 273

  
266 274

  
267 275
aggregate_intra_strain_nucs = structure(function(# Aggregate replicated sample's nucleosomes.
268
### This function aggregates nucleosome for replicated samples. It uses TemplateFilter ouput of each sample as replicate. Each sample owns a set of nucleosomes computed using TemplateFilter and ordered by the position of their center. Adajacent nucleosomes are compared two by two. Comparison is based on a log likelihood ratio score. The issue of comparison is adjacents nucleosomes merge or separation. Finally the function returns a list of clusters and all computed \emph{lod_scores}. Each cluster ows an attribute \emph{wp} for "well positionned". This attribute is set as \emph{TRUE} if the cluster is composed of exactly one nucleosomes of each sample.
276
### This function aggregates nucleosome for replicated samples. It uses TemplateFilter ouput of each sample as replicate. Each sample owns a set of nucleosomes computed using TemplateFilter and ordered by the position of their center. Adajacent nucleosomes are compared two by two. Comparison is based on a log likelihood ratio score. The issue of comparison is adjacents nucleosomes merge or separation. Finally the function returns a list of clusters and all computed \emph{llr_scores}. Each cluster ows an attribute \emph{wp} for "well positionned". This attribute is set as \emph{TRUE} if the cluster is composed of exactly one nucleosomes of each sample.
269 277
samples, ##<< A list of samples. Each sample is a list like \emph{sample = list(id=..., marker=..., strain=..., roi=..., inputs=..., outputs=...)} with \emph{roi = list(name=..., begin=...,  end=..., chr=..., genome=...)}.
270
lod_thres=20, ##<< Log likelihood ration threshold.
278
llr_thres=20, ##<< Log likelihood ration threshold.
271 279
coord_max=20000000 ##<< A too big value to be a coord for a nucleosome lower bound.
272 280
){
273 281
	end_of_tracks = function(tracks) {
......
301 309
		return(clusters)
302 310
	}
303 311
	strain = samples[[1]]$strain
304
	lod_scores = c()
312
	llr_scores = c()
305 313
  min_nuc_center = min(samples[[1]]$roi$begin, samples[[1]]$roi$end)
306 314
	max_nuc_center = max(samples[[1]]$roi$begin, samples[[1]]$roi$end)
307 315
  # compute clusters
......
311 319
  indexes = c()
312 320
  track_readers = c()
313 321
  current_nuc = NULL
314
	lod_score = lod_thres + 1
322
	llr_score = llr_thres + 1
315 323
  # Read nucs from TF outputs
316 324
  tf_outs = list()
317 325
	i = 1
......
356 364
    new_upper_bound = new_nuc$upper_bound
357 365

  
358 366
    if (!is.null(current_nuc)) {
359
			lod_score = lod_score_vecs(current_nuc$original_reads,new_nuc$original_reads)
360
			lod_scores = c(lod_scores,lod_score)
367
			llr_score = llr_score_nvecs(list(current_nuc$original_reads,new_nuc$original_reads))
368
			llr_scores = c(llr_scores,llr_score)
361 369
		}
362
		# print(paste(lod_score, length(current_nuc$original_reads), length(new_nuc$original_reads), sep=" "))
363
		if (is.na(lod_score)) {
364
			lod_score = lod_thres + 1
370
		# print(paste(llr_score, length(current_nuc$original_reads), length(new_nuc$original_reads), sep=" "))
371
		if (is.na(llr_score)) {
372
			llr_score = llr_thres + 1
365 373
		}
366
		# Store lod_score
367
		new_nuc$lod_score = lod_score
368
	  if (lod_score < lod_thres) {
374
		# Store llr_score
375
		new_nuc$llr_score = llr_score
376
	  if (llr_score < llr_thres) {
369 377
      # aggregate to current cluster
370 378
      #   update bound
371 379
      if (new_nuc$upper_bound > new_cluster$upper_bound) {
......
417 425
    # store old cluster
418 426
    clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster,nuc_from_track,length(tf_outs),min_nuc_center, max_nuc_center)
419 427
  }
420
	return(list(clusters, lod_scores))
421
### Returns a list of clusterized nucleosomes, and all computed lod scores.
428
	return(list(clusters, llr_scores))
429
### Returns a list of clusterized nucleosomes, and all computed llr scores.
422 430
}, ex=function(){
423 431
	# Dealing with a region of interest
424 432
	roi =list(name="example", begin=1000,  end=1300, chr="1", genome=rep("A",301))
......
470 478
			tmp_nuc_as_list[["nb_reads"]] = length(all_original_reads)
471 479
			tmp_nuc_as_list[["nb_nucs"]] = length(tmp_nuc$nucs)
472 480
			if (tmp_nuc$wp) {
473
				tmp_nuc_as_list[["lod_1"]] = signif(tmp_nuc$nucs[[2]]$lod_score,5)
474
				tmp_nuc_as_list[["lod_2"]] = signif(tmp_nuc$nucs[[3]]$lod_score,5)
481
				tmp_nuc_as_list[["llr_1"]] = signif(tmp_nuc$nucs[[2]]$llr_score,5)
482
				tmp_nuc_as_list[["llr_2"]] = signif(tmp_nuc$nucs[[3]]$llr_score,5)
475 483
			} else {
476
				tmp_nuc_as_list[["lod_1"]] = NA
477
				tmp_nuc_as_list[["lod_2"]] = NA
484
				tmp_nuc_as_list[["llr_1"]] = NA
485
				tmp_nuc_as_list[["llr_2"]] = NA
478 486
			}
479 487
      return(tmp_nuc_as_list)
480 488
    })
......
490 498
wp_nucs_strain_ref1=NULL, ##<< List of aggregates nucleosome for strain 1. If it's null this list will be computed.
491 499
wp_nucs_strain_ref2=NULL, ##<< List of aggregates nucleosome for strain 2. If it's null this list will be computed.
492 500
corr_thres=0.5, ##<< Correlation threshold.
493
lod_thres=100, ##<< LOD cut off.
501
llr_thres=100, ##<< LOD cut off.
494 502
config=NULL, ##<< GLOBAL config variable
495 503
... ##<< A list of parameters that will be passed to \emph{aggregate_intra_strain_nucs} if needed.
496 504
) {
......
500 508
		print("WARNING, align_inter_strain_nucs will use 2 first sets of replicates as inputs.")
501 509
	}
502 510
	common_nuc = NULL
503
	lod_scores = c()
511
	llr_scores = c()
504 512
	chr = replicates[[1]][[1]]$roi$chr
505 513
  min_nuc_center = min(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end)
506 514
	max_nuc_center = max(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end)
......
571 579
								# tranlation of reads into strain 2 coords
572 580
								diff = ((roi_strain_ref1$begin + roi_strain_ref1$end) - (roi_strain_ref2$begin + roi_strain_ref2$end)) / 2
573 581
								reads_strain_ref1 = reads_strain_ref1 - rep(diff, length(reads_strain_ref1))
574
								lod_score = lod_score_vecs(reads_strain_ref1, reads_strain_ref2)
575
								lod_scores = c(lod_scores, lod_score)
582
								llr_score = llr_score_nvecs(list(reads_strain_ref1, reads_strain_ref2))
583
								llr_scores = c(llr_scores, llr_score)
576 584
								# Filtering on LOD Score
577
								if (lod_score < lod_thres) {
585
								if (llr_score < llr_thres) {
578 586
									tmp_nuc = list()
579 587
									# strain_ref1
580 588
									tmp_nuc[[paste("chr_", strain_ref1, sep="")]] = chr
......
599 607
									# tmp_nuc[[paste("corr2_", strain_ref2, sep="")]] = signif(nuc_strain_ref2$nucs[[2]]$corr,5)
600 608
									# tmp_nuc[[paste("corr3_", strain_ref2, sep="")]] = signif(nuc_strain_ref2$nucs[[3]]$corr,5)
601 609
									# common
602
									tmp_nuc[["lod_score"]] = signif(lod_score,5)
610
									tmp_nuc[["llr_score"]] = signif(llr_score,5)
603 611
									# print(tmp_nuc)
604 612
									common_nuc = dfadd(common_nuc, tmp_nuc)
605 613
								}
......
640 648
			common_nuc = common_nuc[-to_remove_list,]
641 649
		}
642 650

  
643
		return(list(common_nuc, lod_scores))
651
		return(list(common_nuc, llr_scores))
644 652
	} else {
645 653
		print("WARNING, no nucs for strain_ref1.")
646 654
		return(NULL)
647 655
	}
648
### Returns a list of clusterized nucleosomes, and all computed lod scores.
656
### Returns a list of clusterized nucleosomes, and all computed llr scores.
649 657
}, ex=function(){
650 658

  
651 659
    # Define new translate_cur function...
......
2028 2036
        tmp_x_next = tmp_x[-1]
2029 2037
        need_shift = apply(t(tmp_x_next - tmp_x_prev), 2, function(delta){ delta < 50})
2030 2038
        tmp_x_inter = (tmp_x_prev + tmp_x_next) / 2 + tmp_track_inter * need_shift
2031
        tmp_lod_inter =signif(unlist(tf_nucs$lod_score)[-1], 2)
2039
        tmp_llr_inter =signif(unlist(tf_nucs$llr_score)[-1], 2)
2032 2040
        new_tmp_x = c()
2033 2041
        new_tmp_y = c()
2034 2042
        index_odd = 1:length(tmp_x) * 2 - 1
......
2046 2054
        } else {
2047 2055
          pos = config$LEGEND_LOD_POS
2048 2056
        }
2049
        col_lod = sapply(tmp_lod_inter, function(lod){if (lod < 20 ) return("green") else return("red")})
2050
        text(tmp_x_inter, tmp_y_inter, tmp_lod_inter, cex=1.5, pos=pos, col=col_lod) 
2057
        col_llr = sapply(tmp_llr_inter, function(llr){if (llr < 20 ) return("green") else return("red")})
2058
        text(tmp_x_inter, tmp_y_inter, tmp_llr_inter, cex=1.5, pos=pos, col=col_llr) 
2051 2059
        
2052 2060
      }
2053 2061

  

Formats disponibles : Unified diff