Révision 7e2d37e1 src/R/nucleominer.R
b/src/R/nucleominer.R | ||
---|---|---|
106 | 106 |
|
107 | 107 |
|
108 | 108 |
|
109 |
lod_score_vecs = structure(function # Likelihood ratio
|
|
110 |
### Compute the likelihood log of two set of value from two models Vs. a unique model.
|
|
109 |
llr_score_nvecs = structure(function # Likelihood ratio
|
|
110 |
### Compute the log likelihood ratio of two or more set of value.
|
|
111 | 111 |
( |
112 |
x ,##<< First vector. |
|
113 |
y ##<< Second vector. |
|
112 |
xs ##<< list of vectors. |
|
114 | 113 |
) { |
115 |
if (length(x) <=1 | length(y) <= 1) { |
|
116 |
return(NA) |
|
117 |
} |
|
118 |
meanX = mean(x) |
|
119 |
sdX = sd(x) |
|
120 |
meanY = mean(y) |
|
121 |
sdY = sd(y) |
|
122 |
meanXY = mean(c(x,y)) |
|
123 |
sdXY = sd(c(x,y)) |
|
124 |
llX = sum(log(dnorm(x,mean=meanX,sd=sdX))) |
|
125 |
llY = sum(log(dnorm(y,mean=meanY,sd=sdY))) |
|
126 |
llXY = sum(log(dnorm(c(x,y),mean=meanXY,sd=sdXY))) |
|
127 |
ratio = llX + llY - llXY |
|
114 |
l = length(xs) |
|
115 |
if (l < 1) { |
|
116 |
return(NA) |
|
117 |
} |
|
118 |
if (l == 1) { |
|
119 |
return(1) |
|
120 |
} |
|
121 |
sumllX = 0 |
|
122 |
for (i in 1:l) { |
|
123 |
x = xs[[i]] |
|
124 |
if (length(x) <= 1) { |
|
125 |
return(NA) |
|
126 |
} |
|
127 |
meanX = mean(x) |
|
128 |
sdX = sd(x) |
|
129 |
llX = sum(log(dnorm(x,mean=meanX,sd=sdX))) |
|
130 |
sumllX = sumllX + llX |
|
131 |
} |
|
132 |
meanXglo = mean(unlist(xs)) |
|
133 |
sdXglo = sd(unlist(xs)) |
|
134 |
llXYZ = sum(log(dnorm(unlist(xs),mean=meanXglo,sd=sdXglo))) |
|
135 |
ratio = sumllX - llXYZ |
|
128 | 136 |
return(ratio) |
129 |
### Returns the likelihood ratio. |
|
137 |
### Returns the log likelihood ratio.
|
|
130 | 138 |
}, ex=function(){ |
131 | 139 |
# LOD score for 2 set of values |
132 | 140 |
mean1=5; sd1=2; card2 = 250 |
... | ... | |
139 | 147 |
lines(min:max,dnorm(min:max,mean1,sd1)*card1,col=2) |
140 | 148 |
lines(min:max,dnorm(min:max,mean2,sd2)*card2,col=3) |
141 | 149 |
lines(min:max,dnorm(min:max,mean(c(x1,x2)),sd(c(x1,x2)))*card2,col=4) |
142 |
lod_score_vecs(x1,x2)
|
|
150 |
llr_score_nvecs(list(x1,x2))
|
|
143 | 151 |
}) |
144 | 152 |
|
145 | 153 |
dfadd = structure(function# Adding list to a dataframe. |
... | ... | |
207 | 215 |
} |
208 | 216 |
tf_outputs$lower_bound = tf_outputs$center - tf_outputs$width/2 |
209 | 217 |
tf_outputs$upper_bound = tf_outputs$center + tf_outputs$width/2 |
210 |
tf_outputs = tf_outputs[tf_outputs$correlation >= corr_thres,] |
|
211 |
tf_outputs = tf_outputs[order(tf_outputs$correlation,decreasing=TRUE),]
|
|
218 |
tf_outputs = tf_outputs[tf_outputs$correlation.score >= corr_thres,]
|
|
219 |
tf_outputs = tf_outputs[order(tf_outputs$correlation.score, decreasing=TRUE),]
|
|
212 | 220 |
i = 1 |
213 | 221 |
while (i <= length(tf_outputs[,1])) { |
214 |
lb = tf_outputs[i,]$low |
|
215 |
ub = tf_outputs[i,]$up |
|
216 |
tf_outputs = tf_outputs[!(tf_outputs$low <= (ub-ol_bp) & tf_outputs$up > ub) & !(tf_outputs$up >= (lb+ol_bp) & tf_outputs$low < lb),]
|
|
222 |
lb = tf_outputs[i,]$lower_bound
|
|
223 |
ub = tf_outputs[i,]$upper_bound
|
|
224 |
tf_outputs = tf_outputs[!(tf_outputs$lower_bound <= (ub-ol_bp) & tf_outputs$upper_bound > ub) & !(tf_outputs$upper_bound >= (lb+ol_bp) & tf_outputs$lower_bound < lb),]
|
|
217 | 225 |
i = i+1 |
218 | 226 |
} |
219 | 227 |
return(tf_outputs) |
... | ... | |
265 | 273 |
|
266 | 274 |
|
267 | 275 |
aggregate_intra_strain_nucs = structure(function(# Aggregate replicated sample's nucleosomes. |
268 |
### This function aggregates nucleosome for replicated samples. It uses TemplateFilter ouput of each sample as replicate. Each sample owns a set of nucleosomes computed using TemplateFilter and ordered by the position of their center. Adajacent nucleosomes are compared two by two. Comparison is based on a log likelihood ratio score. The issue of comparison is adjacents nucleosomes merge or separation. Finally the function returns a list of clusters and all computed \emph{lod_scores}. Each cluster ows an attribute \emph{wp} for "well positionned". This attribute is set as \emph{TRUE} if the cluster is composed of exactly one nucleosomes of each sample.
|
|
276 |
### This function aggregates nucleosome for replicated samples. It uses TemplateFilter ouput of each sample as replicate. Each sample owns a set of nucleosomes computed using TemplateFilter and ordered by the position of their center. Adajacent nucleosomes are compared two by two. Comparison is based on a log likelihood ratio score. The issue of comparison is adjacents nucleosomes merge or separation. Finally the function returns a list of clusters and all computed \emph{llr_scores}. Each cluster ows an attribute \emph{wp} for "well positionned". This attribute is set as \emph{TRUE} if the cluster is composed of exactly one nucleosomes of each sample.
|
|
269 | 277 |
samples, ##<< A list of samples. Each sample is a list like \emph{sample = list(id=..., marker=..., strain=..., roi=..., inputs=..., outputs=...)} with \emph{roi = list(name=..., begin=..., end=..., chr=..., genome=...)}. |
270 |
lod_thres=20, ##<< Log likelihood ration threshold.
|
|
278 |
llr_thres=20, ##<< Log likelihood ration threshold.
|
|
271 | 279 |
coord_max=20000000 ##<< A too big value to be a coord for a nucleosome lower bound. |
272 | 280 |
){ |
273 | 281 |
end_of_tracks = function(tracks) { |
... | ... | |
301 | 309 |
return(clusters) |
302 | 310 |
} |
303 | 311 |
strain = samples[[1]]$strain |
304 |
lod_scores = c()
|
|
312 |
llr_scores = c()
|
|
305 | 313 |
min_nuc_center = min(samples[[1]]$roi$begin, samples[[1]]$roi$end) |
306 | 314 |
max_nuc_center = max(samples[[1]]$roi$begin, samples[[1]]$roi$end) |
307 | 315 |
# compute clusters |
... | ... | |
311 | 319 |
indexes = c() |
312 | 320 |
track_readers = c() |
313 | 321 |
current_nuc = NULL |
314 |
lod_score = lod_thres + 1
|
|
322 |
llr_score = llr_thres + 1
|
|
315 | 323 |
# Read nucs from TF outputs |
316 | 324 |
tf_outs = list() |
317 | 325 |
i = 1 |
... | ... | |
356 | 364 |
new_upper_bound = new_nuc$upper_bound |
357 | 365 |
|
358 | 366 |
if (!is.null(current_nuc)) { |
359 |
lod_score = lod_score_vecs(current_nuc$original_reads,new_nuc$original_reads)
|
|
360 |
lod_scores = c(lod_scores,lod_score)
|
|
367 |
llr_score = llr_score_nvecs(list(current_nuc$original_reads,new_nuc$original_reads))
|
|
368 |
llr_scores = c(llr_scores,llr_score)
|
|
361 | 369 |
} |
362 |
# print(paste(lod_score, length(current_nuc$original_reads), length(new_nuc$original_reads), sep=" "))
|
|
363 |
if (is.na(lod_score)) {
|
|
364 |
lod_score = lod_thres + 1
|
|
370 |
# print(paste(llr_score, length(current_nuc$original_reads), length(new_nuc$original_reads), sep=" "))
|
|
371 |
if (is.na(llr_score)) {
|
|
372 |
llr_score = llr_thres + 1
|
|
365 | 373 |
} |
366 |
# Store lod_score
|
|
367 |
new_nuc$lod_score = lod_score
|
|
368 |
if (lod_score < lod_thres) {
|
|
374 |
# Store llr_score
|
|
375 |
new_nuc$llr_score = llr_score
|
|
376 |
if (llr_score < llr_thres) {
|
|
369 | 377 |
# aggregate to current cluster |
370 | 378 |
# update bound |
371 | 379 |
if (new_nuc$upper_bound > new_cluster$upper_bound) { |
... | ... | |
417 | 425 |
# store old cluster |
418 | 426 |
clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster,nuc_from_track,length(tf_outs),min_nuc_center, max_nuc_center) |
419 | 427 |
} |
420 |
return(list(clusters, lod_scores))
|
|
421 |
### Returns a list of clusterized nucleosomes, and all computed lod scores.
|
|
428 |
return(list(clusters, llr_scores))
|
|
429 |
### Returns a list of clusterized nucleosomes, and all computed llr scores.
|
|
422 | 430 |
}, ex=function(){ |
423 | 431 |
# Dealing with a region of interest |
424 | 432 |
roi =list(name="example", begin=1000, end=1300, chr="1", genome=rep("A",301)) |
... | ... | |
470 | 478 |
tmp_nuc_as_list[["nb_reads"]] = length(all_original_reads) |
471 | 479 |
tmp_nuc_as_list[["nb_nucs"]] = length(tmp_nuc$nucs) |
472 | 480 |
if (tmp_nuc$wp) { |
473 |
tmp_nuc_as_list[["lod_1"]] = signif(tmp_nuc$nucs[[2]]$lod_score,5)
|
|
474 |
tmp_nuc_as_list[["lod_2"]] = signif(tmp_nuc$nucs[[3]]$lod_score,5)
|
|
481 |
tmp_nuc_as_list[["llr_1"]] = signif(tmp_nuc$nucs[[2]]$llr_score,5)
|
|
482 |
tmp_nuc_as_list[["llr_2"]] = signif(tmp_nuc$nucs[[3]]$llr_score,5)
|
|
475 | 483 |
} else { |
476 |
tmp_nuc_as_list[["lod_1"]] = NA
|
|
477 |
tmp_nuc_as_list[["lod_2"]] = NA
|
|
484 |
tmp_nuc_as_list[["llr_1"]] = NA
|
|
485 |
tmp_nuc_as_list[["llr_2"]] = NA
|
|
478 | 486 |
} |
479 | 487 |
return(tmp_nuc_as_list) |
480 | 488 |
}) |
... | ... | |
490 | 498 |
wp_nucs_strain_ref1=NULL, ##<< List of aggregates nucleosome for strain 1. If it's null this list will be computed. |
491 | 499 |
wp_nucs_strain_ref2=NULL, ##<< List of aggregates nucleosome for strain 2. If it's null this list will be computed. |
492 | 500 |
corr_thres=0.5, ##<< Correlation threshold. |
493 |
lod_thres=100, ##<< LOD cut off.
|
|
501 |
llr_thres=100, ##<< LOD cut off.
|
|
494 | 502 |
config=NULL, ##<< GLOBAL config variable |
495 | 503 |
... ##<< A list of parameters that will be passed to \emph{aggregate_intra_strain_nucs} if needed. |
496 | 504 |
) { |
... | ... | |
500 | 508 |
print("WARNING, align_inter_strain_nucs will use 2 first sets of replicates as inputs.") |
501 | 509 |
} |
502 | 510 |
common_nuc = NULL |
503 |
lod_scores = c()
|
|
511 |
llr_scores = c()
|
|
504 | 512 |
chr = replicates[[1]][[1]]$roi$chr |
505 | 513 |
min_nuc_center = min(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end) |
506 | 514 |
max_nuc_center = max(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end) |
... | ... | |
571 | 579 |
# tranlation of reads into strain 2 coords |
572 | 580 |
diff = ((roi_strain_ref1$begin + roi_strain_ref1$end) - (roi_strain_ref2$begin + roi_strain_ref2$end)) / 2 |
573 | 581 |
reads_strain_ref1 = reads_strain_ref1 - rep(diff, length(reads_strain_ref1)) |
574 |
lod_score = lod_score_vecs(reads_strain_ref1, reads_strain_ref2)
|
|
575 |
lod_scores = c(lod_scores, lod_score)
|
|
582 |
llr_score = llr_score_nvecs(list(reads_strain_ref1, reads_strain_ref2))
|
|
583 |
llr_scores = c(llr_scores, llr_score)
|
|
576 | 584 |
# Filtering on LOD Score |
577 |
if (lod_score < lod_thres) {
|
|
585 |
if (llr_score < llr_thres) {
|
|
578 | 586 |
tmp_nuc = list() |
579 | 587 |
# strain_ref1 |
580 | 588 |
tmp_nuc[[paste("chr_", strain_ref1, sep="")]] = chr |
... | ... | |
599 | 607 |
# tmp_nuc[[paste("corr2_", strain_ref2, sep="")]] = signif(nuc_strain_ref2$nucs[[2]]$corr,5) |
600 | 608 |
# tmp_nuc[[paste("corr3_", strain_ref2, sep="")]] = signif(nuc_strain_ref2$nucs[[3]]$corr,5) |
601 | 609 |
# common |
602 |
tmp_nuc[["lod_score"]] = signif(lod_score,5)
|
|
610 |
tmp_nuc[["llr_score"]] = signif(llr_score,5)
|
|
603 | 611 |
# print(tmp_nuc) |
604 | 612 |
common_nuc = dfadd(common_nuc, tmp_nuc) |
605 | 613 |
} |
... | ... | |
640 | 648 |
common_nuc = common_nuc[-to_remove_list,] |
641 | 649 |
} |
642 | 650 |
|
643 |
return(list(common_nuc, lod_scores))
|
|
651 |
return(list(common_nuc, llr_scores))
|
|
644 | 652 |
} else { |
645 | 653 |
print("WARNING, no nucs for strain_ref1.") |
646 | 654 |
return(NULL) |
647 | 655 |
} |
648 |
### Returns a list of clusterized nucleosomes, and all computed lod scores.
|
|
656 |
### Returns a list of clusterized nucleosomes, and all computed llr scores.
|
|
649 | 657 |
}, ex=function(){ |
650 | 658 |
|
651 | 659 |
# Define new translate_cur function... |
... | ... | |
2028 | 2036 |
tmp_x_next = tmp_x[-1] |
2029 | 2037 |
need_shift = apply(t(tmp_x_next - tmp_x_prev), 2, function(delta){ delta < 50}) |
2030 | 2038 |
tmp_x_inter = (tmp_x_prev + tmp_x_next) / 2 + tmp_track_inter * need_shift |
2031 |
tmp_lod_inter =signif(unlist(tf_nucs$lod_score)[-1], 2)
|
|
2039 |
tmp_llr_inter =signif(unlist(tf_nucs$llr_score)[-1], 2)
|
|
2032 | 2040 |
new_tmp_x = c() |
2033 | 2041 |
new_tmp_y = c() |
2034 | 2042 |
index_odd = 1:length(tmp_x) * 2 - 1 |
... | ... | |
2046 | 2054 |
} else { |
2047 | 2055 |
pos = config$LEGEND_LOD_POS |
2048 | 2056 |
} |
2049 |
col_lod = sapply(tmp_lod_inter, function(lod){if (lod < 20 ) return("green") else return("red")})
|
|
2050 |
text(tmp_x_inter, tmp_y_inter, tmp_lod_inter, cex=1.5, pos=pos, col=col_lod)
|
|
2057 |
col_llr = sapply(tmp_llr_inter, function(llr){if (llr < 20 ) return("green") else return("red")})
|
|
2058 |
text(tmp_x_inter, tmp_y_inter, tmp_llr_inter, cex=1.5, pos=pos, col=col_llr)
|
|
2051 | 2059 |
|
2052 | 2060 |
} |
2053 | 2061 |
|
Formats disponibles : Unified diff