root / src / R / nucleominer.R @ 3bfae55d
Historique | Voir | Annoter | Télécharger (87,19 ko)
1 |
FDR = structure(function# False Discovery Rate |
---|---|
2 |
### From a vector x of independent p-values, extract the cutoff corresponding to the specified FDR. See Benjamini & Hochberg 1995 paper |
3 |
##author<< Gael Yvert, |
4 |
( |
5 |
x, ##<< A vector x of independent p-values. |
6 |
FDR ##<< The specified FDR. |
7 |
) { |
8 |
x <- sort(na.omit(x)) |
9 |
N = length(x) |
10 |
i = 1; |
11 |
while(N*x[i]/i < FDR & i <= N) i = i + 1; # we search for the highest i where Nrandom / Nobserved < FDR |
12 |
if (i == 1) |
13 |
return (NA) |
14 |
else |
15 |
return( x[i-1] ) |
16 |
### Return the the corresponding cutoff. |
17 |
}, ex=function(){ |
18 |
print("example") |
19 |
}) |
20 |
|
21 |
|
22 |
|
23 |
llr_score_nvecs = structure(function # Likelihood ratio |
24 |
### Compute the log likelihood ratio of two or more set of value. |
25 |
( |
26 |
xs ##<< list of vectors. |
27 |
) { |
28 |
l = length(xs) |
29 |
if (l < 1) { |
30 |
return(NA) |
31 |
} |
32 |
if (l == 1) { |
33 |
return(1) |
34 |
} |
35 |
sumllX = 0 |
36 |
for (i in 1:l) { |
37 |
x = xs[[i]] |
38 |
if (length(x) <= 1) { |
39 |
return(NA) |
40 |
} |
41 |
meanX = mean(x) |
42 |
sdX = sd(x) |
43 |
llX = sum(log(dnorm(x,mean=meanX,sd=sdX))) |
44 |
sumllX = sumllX + llX |
45 |
} |
46 |
meanXglo = mean(unlist(xs)) |
47 |
sdXglo = sd(unlist(xs)) |
48 |
llXYZ = sum(log(dnorm(unlist(xs),mean=meanXglo,sd=sdXglo))) |
49 |
ratio = sumllX - llXYZ |
50 |
return(ratio) |
51 |
### Returns the log likelihood ratio. |
52 |
}, ex=function(){ |
53 |
# LLR score for 2 set of values |
54 |
mean1=5; sd1=2; card2 = 250 |
55 |
mean2=6; sd2=3; card1 = 200 |
56 |
x1 = rnorm(card1, mean1, sd1) |
57 |
x2 = rnorm(card2, mean2, sd2) |
58 |
min = floor(min(c(x1,x2))) |
59 |
max = ceiling(max(c(x1,x2))) |
60 |
hist(c(x1,x2), xlim=c(min, max), breaks=min:max) |
61 |
lines(min:max,dnorm(min:max,mean1,sd1)*card1,col=2) |
62 |
lines(min:max,dnorm(min:max,mean2,sd2)*card2,col=3) |
63 |
lines(min:max,dnorm(min:max,mean(c(x1,x2)),sd(c(x1,x2)))*card2,col=4) |
64 |
llr_score_nvecs(list(x1,x2)) |
65 |
}) |
66 |
|
67 |
dfadd = structure(function# Adding list to a dataframe. |
68 |
### Add a list \emph{l} to a dataframe \emph{df}. Create it if \emph{df} is \emph{NULL}. Return the dataframe \emph{df}. |
69 |
(df, ##<< A dataframe |
70 |
l ##<< A list |
71 |
) { |
72 |
if (is.null(df)) { |
73 |
df = data.frame(l,stringsAsFactors=FALSE) |
74 |
} else { |
75 |
df = rbind(df, data.frame(l,stringsAsFactors=FALSE)) |
76 |
} |
77 |
return(df) |
78 |
### Return the dataframe \emph{df}. |
79 |
}, ex=function(){ |
80 |
## Here dataframe is NULL |
81 |
print(df) |
82 |
df = NULL |
83 |
|
84 |
# Initialize df |
85 |
df = dfadd(df, list(key1 = "value1", key2 = "value2")) |
86 |
print(df) |
87 |
|
88 |
# Adding elements to df |
89 |
df = dfadd(df, list(key1 = "value1'", key2 = "value2'")) |
90 |
print(df) |
91 |
}) |
92 |
|
93 |
|
94 |
sign_from_strand = function( |
95 |
### Get the sign of strand |
96 |
strands) { |
97 |
apply(t(strands), 2, function(strand) { if (strand == "F") return(1) else return(-1)}) |
98 |
### If strand in forward then returns 1 else returns -1 |
99 |
} |
100 |
|
101 |
flat_reads = function( |
102 |
### Extract reads coordinates from TempleteFilter input sequence |
103 |
reads, ##<< TemplateFilter input reads |
104 |
nuc_width ##<< Width used to shift F and R reads. |
105 |
) { |
106 |
F_flatted_reads = unlist(apply(t(reads[reads$V3=="F",]),2,function(r){rep(as.integer(r[2]), r[4])})) |
107 |
R_flatted_reads = unlist(apply(t(reads[reads$V3=="R",]),2,function(r){rep(as.integer(r[2]), r[4])})) |
108 |
flatted_reads = c(F_flatted_reads + rep(nuc_width/2, length(F_flatted_reads)), R_flatted_reads - rep(nuc_width/2, length(R_flatted_reads)) ) |
109 |
return(list(F_flatted_reads, R_flatted_reads, flatted_reads)) |
110 |
### Returns a list of F reads, R reads and joint/shifted F and R reads. |
111 |
} |
112 |
|
113 |
|
114 |
|
115 |
filter_tf_outputs = function(# Filter TemplateFilter outputs |
116 |
### This function filters TemplateFilter outputs according, not only genome area observerved properties, but also correlation and overlapping threshold. |
117 |
tf_outputs, ##<< TemplateFilter outputs. |
118 |
chr, ##<< Chromosome observed, here chr is an integer. |
119 |
x_min, ##<< Coordinate of the first bp observed. |
120 |
x_max, ##<< Coordinate of the last bp observed. |
121 |
nuc_width = 160, ##<< Nucleosome width. |
122 |
ol_bp = 59, ##<< Overlap Threshold. |
123 |
corr_thres = 0.5 ##<< Correlation threshold. |
124 |
) { |
125 |
if (x_min < 0) { |
126 |
tf_outputs = tf_outputs[tf_outputs$chr == paste("chr", chr, sep="") & tf_outputs$center - tf_outputs$width/2 >= -x_max & tf_outputs$center + tf_outputs$width/2 <= -x_min,] |
127 |
} else { |
128 |
tf_outputs = tf_outputs[tf_outputs$chr == paste("chr", chr, sep="") & tf_outputs$center - tf_outputs$width/2 >= x_min & tf_outputs$center + tf_outputs$width/2 <= x_max,] |
129 |
} |
130 |
tf_outputs$lower_bound = tf_outputs$center - tf_outputs$width/2 |
131 |
tf_outputs$upper_bound = tf_outputs$center + tf_outputs$width/2 |
132 |
tf_outputs = tf_outputs[tf_outputs$correlation.score >= corr_thres,] |
133 |
tf_outputs = tf_outputs[order(tf_outputs$correlation.score, decreasing=TRUE),] |
134 |
i = 1 |
135 |
while (i <= length(tf_outputs[,1])) { |
136 |
lb = tf_outputs[i,]$lower_bound |
137 |
ub = tf_outputs[i,]$upper_bound |
138 |
tf_outputs = tf_outputs[!(tf_outputs$lower_bound <= (ub-ol_bp) & tf_outputs$upper_bound > ub) & !(tf_outputs$upper_bound >= (lb+ol_bp) & tf_outputs$lower_bound < lb),] |
139 |
i = i+1 |
140 |
} |
141 |
return(tf_outputs) |
142 |
### Returns filtered TemplateFilter Outputs |
143 |
} |
144 |
|
145 |
|
146 |
|
147 |
filter_tf_inputs = function(# Filter TemplateFilter inputs |
148 |
### This function filters TemplateFilter inputs according genome area observed properties. It takes into account reads that are at the frontier of this area and the strand of these reads. |
149 |
inputs, ##<< TF inputs to be filtered. |
150 |
chr, ##<< Chromosome observed, here chr is an integer. |
151 |
x_min, ##<< Coordinate of the first bp observed. |
152 |
x_max, ##<< Coordinate of the last bp observed. |
153 |
nuc_width = 160, ##<< Nucleosome width. |
154 |
only_f = FALSE, ##<< Filter only F reads. |
155 |
only_r = FALSE, ##<< Filter only R reads. |
156 |
filter_for_coverage = FALSE ##<< Does it filter for plot coverage? |
157 |
) { |
158 |
if (only_f) { |
159 |
inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "F" & inputs[,2] <= x_max + nuc_width,] |
160 |
} else if (only_r) { |
161 |
inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "R" & inputs[,2] <= x_max + nuc_width,] |
162 |
} else { |
163 |
inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,2] <= x_max + nuc_width,] |
164 |
} |
165 |
if (!filter_for_coverage) { |
166 |
corrected_inputs_coords = inputs[,2] + nuc_width/2 * sign_from_strand(inputs[,3]) |
167 |
inputs = inputs[inputs[,1]==chr & corrected_inputs_coords >= x_min & corrected_inputs_coords <= x_max,] |
168 |
} |
169 |
return(inputs) |
170 |
### Returns filtred inputs. |
171 |
} |
172 |
|
173 |
|
174 |
|
175 |
# filter_tf_inputs = function(# Filter TemplateFilter inputs |
176 |
# ### This function filters TemplateFilter inputs according genome area observed properties. It takes into account reads that are at the frontier of this area and the strand of these reads. |
177 |
# inputs, ##<< TF inputs to be filtered. |
178 |
# chr, ##<< Chromosome observed, here chr is an integer. |
179 |
# x_min, ##<< Coordinate of the first bp observed. |
180 |
# x_max, ##<< Coordinate of the last bp observed. |
181 |
# nuc_width = 160, ##<< Nucleosome width. |
182 |
# only_f = FALSE, ##<< Filter only F reads. |
183 |
# only_r = FALSE, ##<< Filter only R reads. |
184 |
# filter_for_coverage = FALSE, ##<< Does it filter for plot coverage? |
185 |
# USE_DPLYR = FALSE ##<< Use dplyr lib to filter reads. |
186 |
# ) { |
187 |
# n = names(inputs) |
188 |
# |
189 |
# if (!USE_DPLYR) { |
190 |
# if (only_f) { |
191 |
# inputs_out = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "F" & inputs[,2] <= x_max + nuc_width,] |
192 |
# } else if (only_r) { |
193 |
# inputs_out = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "R" & inputs[,2] <= x_max + nuc_width,] |
194 |
# } else { |
195 |
# inputs_out = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,2] <= x_max + nuc_width,] |
196 |
# } |
197 |
# } else { |
198 |
# names(inputs) = c("chr", "pos", "str", "lev") |
199 |
# if (only_f) { |
200 |
# inputs_out = filter(inputs, chr == chr, pos >= x_min - nuc_width, str == "F", pos <= x_max + nuc_width) |
201 |
# } else if (only_r) { |
202 |
# inputs_out = filter(inputs, chr == chr, pos >= x_min - nuc_width, str == "R" & pos <= x_max + nuc_width) |
203 |
# } else { |
204 |
# inputs_out = filter(inputs, chr == chr, pos >= x_min - nuc_width, pos <= x_max + nuc_width) |
205 |
# } |
206 |
# # if (!filter_for_coverage) { |
207 |
# # inputs$corrected_inputs_coords = inputs[,2] + nuc_width/2 * sign_from_strand(inputs[,3]) |
208 |
# # inputs = filter(inputs, chr == chr, corrected_inputs_coords >= x_min, corrected_inputs_coords <= x_max) |
209 |
# # inputs$corrected_inputs_coords = NULL |
210 |
# # } |
211 |
# } |
212 |
# |
213 |
# if (!filter_for_coverage) { |
214 |
# corrected_inputs_coords = inputs_out[,2] + nuc_width/2 * sign_from_strand(inputs_out[,3]) |
215 |
# inputs_out = inputs_out[inputs_out[,1]==chr & corrected_inputs_coords >= x_min & corrected_inputs_coords <= x_max,] |
216 |
# } |
217 |
# |
218 |
# names(inputs_out) = n |
219 |
# return(inputs_out) |
220 |
# ### Returns filtred inputs. |
221 |
# } |
222 |
|
223 |
|
224 |
|
225 |
get_comp_strand = function( |
226 |
### Compute the complementatry strand. |
227 |
strand ##<< The original strand. |
228 |
) { |
229 |
apply(t(strand),2, function(n){ |
230 |
if (n=="a") {return("t")} |
231 |
if (n=="t") {return("a")} |
232 |
if (n=="c") {return("g")} |
233 |
if (n=="g") {return("c")} |
234 |
}) |
235 |
### Returns the complementatry strand. |
236 |
} |
237 |
|
238 |
|
239 |
aggregate_intra_strain_nucs = structure(function(# Aggregate replicated sample's nucleosomes. |
240 |
### This function aggregates nucleosomes from replicated samples. It uses TemplateFilter ouput of each sample as replicate. Each sample owns a set of nucleosomes computed using TemplateFilter and ordered by the position of their center (dyad). A chain of nucleosomes is builts across all replicates. Adjacent nucleosomes of the chain are compared two by two. Comparison is based on a log likelihood ratio (LLR1). depending on the LLR1 value nucleosomes are merged (low LLR) or separated (high LLR). Finally the function returns a list of clusters and all computed llr_scores. Each cluster ows an attribute wp for "well positioned". This attribute is set to TRUE if the cluster is composed of exactly one nucleosome of each sample. |
241 |
samples, ##<< A list of samples. Each sample is a list like \emph{sample = list(id=..., marker=..., strain=..., roi=..., inputs=..., outputs=...)} with \emph{roi = list(name=..., begin=..., end=..., chr=..., genome=...)}. |
242 |
llr_thres=20, ##<< Log likelihood ratio threshold to decide between merging and separating |
243 |
coord_max=20000000 ##<< A too big value to be a coord for a nucleosome lower bound. |
244 |
){ |
245 |
end_of_tracks = function(tracks) { |
246 |
if (length(tracks) == 0) { |
247 |
return(TRUE) |
248 |
} |
249 |
for (lower_bound in tracks) { |
250 |
if(!is.na(lower_bound)) { |
251 |
if (lower_bound < coord_max) { |
252 |
return(FALSE) |
253 |
} |
254 |
} |
255 |
} |
256 |
return(TRUE) |
257 |
} |
258 |
store_cluster = function(clusters, new_cluster, nb_nucs_in_cluster, nuc_from_track, nb_tracks, min_nuc_center, max_nuc_center) { |
259 |
if ( nb_nucs_in_cluster==nb_tracks & sum(nuc_from_track)==nb_tracks) { |
260 |
new_cluster$wp = TRUE |
261 |
center = (new_cluster$lower_bound + new_cluster$upper_bound) / 2 |
262 |
if (is.null(min_nuc_center) | ((min_nuc_center <= center) & (center < max_nuc_center))) { |
263 |
clusters[[length(clusters) + 1]] = new_cluster |
264 |
# print(new_cluster) |
265 |
} |
266 |
} else { |
267 |
new_cluster$wp = FALSE |
268 |
center = (new_cluster$lower_bound + new_cluster$upper_bound) / 2 |
269 |
if (is.null(min_nuc_center) | ((min_nuc_center <= center) & (center < max_nuc_center))) { |
270 |
clusters[[length(clusters) + 1]] = new_cluster |
271 |
} |
272 |
} |
273 |
return(clusters) |
274 |
} |
275 |
strain = samples[[1]]$strain |
276 |
llr_scores = c() |
277 |
min_nuc_center = min(samples[[1]]$roi$begin, samples[[1]]$roi$end) |
278 |
max_nuc_center = max(samples[[1]]$roi$begin, samples[[1]]$roi$end) |
279 |
# compute clusters |
280 |
clusters = list() |
281 |
cluster_contents = list() |
282 |
# Init reader |
283 |
indexes = c() |
284 |
track_readers = c() |
285 |
current_nuc = NULL |
286 |
llr_score = llr_thres + 1 |
287 |
# Read nucs from TF outputs |
288 |
tf_outs = list() |
289 |
i = 1 |
290 |
for (sample in samples) { |
291 |
tf_outs[[i]] = sample$outputs |
292 |
tf_outs[[i]] = tf_outs[[i]][order(tf_outs[[i]]$center),] |
293 |
indexes[i] = 1 |
294 |
if (is.na(tf_outs[[i]][indexes[i],]$center)) { |
295 |
track_readers[i] = coord_max |
296 |
} else { |
297 |
track_readers[i] = tf_outs[[i]][indexes[i],]$center |
298 |
} |
299 |
i = i+1 |
300 |
} |
301 |
nb_tracks = length(tf_outs) |
302 |
# print(track_readers) |
303 |
new_cluster = NULL |
304 |
nb_nucs_in_cluster = 0 |
305 |
nuc_from_track = c() |
306 |
for (i in 1:nb_tracks){ |
307 |
nuc_from_track[i] = FALSE |
308 |
} |
309 |
# Start clustering |
310 |
while (!end_of_tracks(track_readers)) { |
311 |
new_center = min(track_readers) |
312 |
current_track = which(track_readers == new_center)[1] |
313 |
new_nuc = as.list(tf_outs[[current_track]][indexes[current_track],]) |
314 |
new_nuc$chr = substr(new_nuc$chr,4,1000000L) |
315 |
new_nuc$inputs = samples[[current_track]]$inputs |
316 |
new_nuc$chr = samples[[current_track]]$roi$chr |
317 |
new_nuc$track = current_track |
318 |
|
319 |
new_nuc$inputs = filter_tf_inputs(samples[[current_track]]$inputs, new_nuc$chr, new_nuc$lower_bound, new_nuc$upper_bound, new_nuc$width) |
320 |
flatted_reads = flat_reads(new_nuc$inputs, new_nuc$width) |
321 |
new_nuc$original_reads = flatted_reads[[3]] |
322 |
|
323 |
new_upper_bound = new_nuc$upper_bound |
324 |
|
325 |
if (!is.null(current_nuc)) { |
326 |
llr_score = llr_score_nvecs(list(current_nuc$original_reads,new_nuc$original_reads)) |
327 |
llr_scores = c(llr_scores,llr_score) |
328 |
} |
329 |
# print(paste(llr_score, length(current_nuc$original_reads), length(new_nuc$original_reads), sep=" ")) |
330 |
if (is.na(llr_score)) { |
331 |
llr_score = llr_thres + 1 |
332 |
} |
333 |
# Store llr_score |
334 |
new_nuc$llr_score = llr_score |
335 |
if (llr_score < llr_thres) { |
336 |
# aggregate to current cluster |
337 |
# update bound |
338 |
if (new_nuc$upper_bound > new_cluster$upper_bound) { |
339 |
new_cluster$upper_bound = new_nuc$upper_bound |
340 |
} |
341 |
if (new_nuc$lower_bound < new_cluster$lower_bound) { |
342 |
new_cluster$lower_bound = new_nuc$lower_bound |
343 |
} |
344 |
# add nucleosome to current cluster |
345 |
nuc_from_track[current_track] = TRUE |
346 |
nb_nucs_in_cluster = nb_nucs_in_cluster + 1 |
347 |
new_cluster$nucs[[length(new_cluster$nucs)+1]] = new_nuc |
348 |
} else { |
349 |
if (!is.null(new_cluster)) { |
350 |
# store old cluster |
351 |
clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster, nuc_from_track, nb_tracks, min_nuc_center, max_nuc_center) |
352 |
} |
353 |
# Reinit current cluster composition stuff |
354 |
nb_nucs_in_cluster = 0 |
355 |
nuc_from_track = c() |
356 |
for (i in 1:nb_tracks){ |
357 |
nuc_from_track[i] = FALSE |
358 |
} |
359 |
# create new cluster |
360 |
new_cluster = list(lower_bound=new_nuc$low, upper_bound=new_nuc$up, chr=new_nuc$chr, strain_ref=strain , nucs=list()) |
361 |
# update upper bound |
362 |
current_upper_bound = new_upper_bound |
363 |
# add nucleosome to current cluster |
364 |
nb_nucs_in_cluster = nb_nucs_in_cluster + 1 |
365 |
nuc_from_track[current_track] = TRUE |
366 |
new_cluster$nucs[[length(new_cluster$nucs)+1]] = new_nuc |
367 |
|
368 |
} |
369 |
|
370 |
current_nuc = new_nuc |
371 |
|
372 |
# update indexes |
373 |
if (indexes[current_track] < length(tf_outs[[current_track]]$center)) { |
374 |
indexes[current_track] = indexes[current_track] + 1 |
375 |
# update track |
376 |
track_readers[current_track] = tf_outs[[current_track]][indexes[current_track],]$center |
377 |
} else { |
378 |
# update track |
379 |
track_readers[current_track] = coord_max |
380 |
} |
381 |
} |
382 |
# store last cluster |
383 |
if (!is.null(new_cluster)) { |
384 |
# store old cluster |
385 |
clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster, nuc_from_track, nb_tracks, min_nuc_center, max_nuc_center) |
386 |
} |
387 |
return(list(clusters, llr_scores)) |
388 |
### Returns a list of clusterized nucleosomes, and all computed llr scores. |
389 |
}, ex=function(){ |
390 |
# Dealing with a region of interest |
391 |
roi =list(name="example", begin=1000, end=1300, chr="1", genome=rep("A",301)) |
392 |
samples = list() |
393 |
for (i in 1:3) { |
394 |
# Create TF output |
395 |
tf_nuc = list("chr"=paste("chr", roi$chr, sep=""), "center"=(roi$end + roi$begin)/2, "width"= 150, "correlation.score"= 0.9) |
396 |
outputs = dfadd(NULL,tf_nuc) |
397 |
outputs = filter_tf_outputs(outputs, roi$chr, roi$begin, roi$end) |
398 |
# Generate corresponding reads |
399 |
nb_reads = round(runif(1,170,230)) |
400 |
reads = round(rnorm(nb_reads, tf_nuc$center,20)) |
401 |
u_reads = sort(unique(reads)) |
402 |
strands = sample(c(rep("R",ceiling(length(u_reads)/2)),rep("F",floor(length(u_reads)/2)))) |
403 |
counts = apply(t(u_reads), 2, function(r) { sum(reads == r)}) |
404 |
shifts = apply(t(strands), 2, function(s) { if (s == "F") return(-tf_nuc$width/2) else return(tf_nuc$width/2)}) |
405 |
u_reads = u_reads + shifts |
406 |
inputs = data.frame(list("V1" = rep(roi$chr, length(u_reads)), |
407 |
"V2" = u_reads, |
408 |
"V3" = strands, |
409 |
"V4" = counts), stringsAsFactors=FALSE) |
410 |
samples[[length(samples) + 1]] = list(id=1, marker="Mnase_Seq", strain="strain_ex", total_reads = 10000000, roi=roi, inputs=inputs, outputs=outputs) |
411 |
} |
412 |
print(aggregate_intra_strain_nucs(samples)) |
413 |
}) |
414 |
|
415 |
flat_aggregated_intra_strain_nucs = function(# to flat aggregate_intra_strain_nucs function output |
416 |
### This function builds a dataframe of all clusters obtain from aggregate_intra_strain_nucs function. |
417 |
partial_strain_maps, ##<< the output of aggregate_intra_strain_nucs function |
418 |
cur_index, ##<< the index of the roi involved |
419 |
nb_tracks=3 ##<< the number of replicates |
420 |
) { |
421 |
if (length(partial_strain_maps) == 0 ){ |
422 |
print(paste("Empty partial_strain_maps for roi", cur_index, "ands current strain." )) |
423 |
tmp_strain_maps = list() |
424 |
} else { |
425 |
tmp_strain_map = apply(t(1:length(partial_strain_maps)), 2, function(i){ |
426 |
tmp_nuc = partial_strain_maps[[i]] |
427 |
tmp_nuc_as_list = list() |
428 |
tmp_nuc_as_list[["chr"]] = tmp_nuc[["chr"]] |
429 |
tmp_nuc_as_list[["lower_bound"]] = ceiling(tmp_nuc[["lower_bound"]]) |
430 |
tmp_nuc_as_list[["upper_bound"]] = floor(tmp_nuc[["upper_bound"]]) |
431 |
tmp_nuc_as_list[["cur_index"]] = cur_index |
432 |
tmp_nuc_as_list[["index_nuc"]] = i |
433 |
tmp_nuc_as_list[["wp"]] = as.integer(tmp_nuc$wp) |
434 |
all_original_reads = c() |
435 |
for (j in 1:length(tmp_nuc$nucs)) { |
436 |
all_original_reads = c(all_original_reads, tmp_nuc$nucs[[j]]$original_reads) |
437 |
} |
438 |
tmp_nuc_as_list[["nb_reads"]] = length(all_original_reads) |
439 |
tmp_nuc_as_list[["nb_nucs"]] = length(tmp_nuc$nucs) |
440 |
if (tmp_nuc$wp) { |
441 |
for (i in 1:(nb_tracks-1)) { |
442 |
tmp_nuc_as_list[[paste("llr", i, sep="_")]] = signif(tmp_nuc$nucs[[i + 1]]$llr_score,5) |
443 |
} |
444 |
} else { |
445 |
for (i in 1:(nb_tracks-1)) { |
446 |
tmp_nuc_as_list[[paste("llr", i, sep="_")]] = NA |
447 |
} |
448 |
} |
449 |
return(tmp_nuc_as_list) |
450 |
}) |
451 |
tmp_strain_maps = do.call("rbind", tmp_strain_map) |
452 |
} |
453 |
return(data.frame(lapply(data.frame(tmp_strain_maps, stringsAsFactors=FALSE), unlist), stringsAsFactors=FALSE)) |
454 |
### Returns a dataframe of all clusters obtain from aggregate_intra_strain_nucs function. |
455 |
} |
456 |
|
457 |
align_inter_strain_nucs = structure(function(# Aligns nucleosomes between 2 strains. |
458 |
### This function aligns nucleosomes between two strains for a given genome region. |
459 |
replicates, ##<< Set of replicates, ideally 3 per strain. |
460 |
wp_nucs_strain_ref1=NULL, ##<< List of aggregates nucleosome for strain 1. If it's NULL this list will be computed. |
461 |
wp_nucs_strain_ref2=NULL, ##<< List of aggregates nucleosome for strain 2. If it's NULL this list will be computed. |
462 |
corr_thres=0.5, ##<< Correlation threshold. |
463 |
llr_thres=100, ##<< Log likelihood ratio threshold to decide between merging and separating |
464 |
config=NULL, ##<< GLOBAL config variable |
465 |
... ##<< A list of parameters that will be passed to \emph{aggregate_intra_strain_nucs} if needed. |
466 |
) { |
467 |
if (length(replicates) < 2) { |
468 |
stop("ERROR, align_inter_strain_nucs needs 2 replicate sets.") |
469 |
} else if (length(replicates) > 2) { |
470 |
print("WARNING, align_inter_strain_nucs will use 2 first sets of replicates as inputs.") |
471 |
} |
472 |
common_nuc = NULL |
473 |
llr_scores = c() |
474 |
chr = replicates[[1]][[1]]$roi$chr |
475 |
min_nuc_center = min(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end) |
476 |
max_nuc_center = max(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end) |
477 |
strain_ref1 = replicates[[1]][[1]]$strain |
478 |
strain_ref2 = replicates[[2]][[1]]$strain |
479 |
big_cur = replicates[[1]][[1]]$roi |
480 |
orig_big_cur = replicates[[1]][[1]]$orig_roi |
481 |
if(big_cur$end - big_cur$begin < 0) { |
482 |
tmp_begin = big_cur$begin |
483 |
big_cur$begin = big_cur$end |
484 |
big_cur$end = tmp_begin |
485 |
} |
486 |
# GO! |
487 |
if (is.null(wp_nucs_strain_ref1)) { |
488 |
wp_nucs_strain_ref1 = aggregate_intra_strain_nucs(replicates[[1]], ...)[[1]] |
489 |
} |
490 |
if (is.null(wp_nucs_strain_ref2)) { |
491 |
wp_nucs_strain_ref2 = aggregate_intra_strain_nucs(replicates[[2]], ...)[[1]] |
492 |
} |
493 |
lws = c() |
494 |
ups = c() |
495 |
for (na in wp_nucs_strain_ref2) { |
496 |
lws = c(lws, na$lower_bound) |
497 |
ups = c(ups, na$upper_bound) |
498 |
} |
499 |
|
500 |
print(paste("Exploring chr" , chr , ", " , length(wp_nucs_strain_ref1) , ", [" , min_nuc_center , ", " , max_nuc_center , "] nucs...", sep="")) |
501 |
roi_strain_ref1 = NULL |
502 |
roi_strain_ref2 = NULL |
503 |
if (length(wp_nucs_strain_ref1) > 0) { |
504 |
for(index_nuc_strain_ref1 in 1:length(wp_nucs_strain_ref1)){ |
505 |
# print(paste("" , index_nuc_strain_ref1 , "/" , length(wp_nucs_strain_ref1), sep="")) |
506 |
nuc_strain_ref1 = wp_nucs_strain_ref1[[index_nuc_strain_ref1]] |
507 |
# Filtering on Well Positionned |
508 |
if (nuc_strain_ref1$wp) { |
509 |
roi_strain_ref1 = list(name=paste("strain_chr_id_" , strain_ref1 , "_" , chr , "_" , "i" , "_", sep=""), begin=nuc_strain_ref1$lower_bound, end=nuc_strain_ref1$upper_bound, chr=chr, strain_ref = strain_ref1) |
510 |
roi_strain_ref2 = translate_cur(roi_strain_ref1, strain_ref2, big_cur=orig_big_cur, config=config) |
511 |
if (!is.null(roi_strain_ref2)){ |
512 |
# LOADING INTRA_STRAIN_NUCS_FILENAME_STRAIN_REF2 FILE(S) TO COMPUTE MATCHING_NAS (FILTER) |
513 |
lower_bound_roi_strain_ref2 = min(roi_strain_ref2$end,roi_strain_ref2$begin) |
514 |
upper_bound_roi_strain_ref2 = max(roi_strain_ref2$end,roi_strain_ref2$begin) |
515 |
matching_nas = which( lower_bound_roi_strain_ref2 <= ups & lws <= upper_bound_roi_strain_ref2) |
516 |
for (index_nuc_strain_ref2 in matching_nas) { |
517 |
nuc_strain_ref2 = wp_nucs_strain_ref2[[index_nuc_strain_ref2]] |
518 |
# Filtering on Well Positionned |
519 |
nuc_strain_ref2_to_roi = list(begin=nuc_strain_ref2$lower_bound, end=nuc_strain_ref2$upper_bound, chr=nuc_strain_ref2$chr, strain_ref = strain_ref2) |
520 |
if (!is.null(translate_cur(nuc_strain_ref2_to_roi, strain_ref1, big_cur=orig_big_cur, config=config)) & |
521 |
nuc_strain_ref2$wp) { |
522 |
# Filtering on correlation Score and collecting reads |
523 |
SKIP = FALSE |
524 |
# TODO: This for loop could be done before working on strain_ref2. Isn't it? |
525 |
reads_strain_ref1 = c() |
526 |
for (nuc in nuc_strain_ref1$nucs){ |
527 |
reads_strain_ref1 = c(reads_strain_ref1, nuc$original_reads) |
528 |
if (nuc$corr < corr_thres) { |
529 |
SKIP = TRUE |
530 |
} |
531 |
} |
532 |
reads_strain_ref2 = c() |
533 |
for (nuc in nuc_strain_ref2$nucs){ |
534 |
reads_strain_ref2 = c(reads_strain_ref2, nuc$original_reads) |
535 |
if (nuc$corr < corr_thres) { |
536 |
SKIP = TRUE |
537 |
} |
538 |
} |
539 |
# Filtering on correlation Score |
540 |
if (!SKIP) { |
541 |
# tranlation of reads into strain 2 coords |
542 |
diff = ((roi_strain_ref1$begin + roi_strain_ref1$end) - (roi_strain_ref2$begin + roi_strain_ref2$end)) / 2 |
543 |
reads_strain_ref1 = reads_strain_ref1 - rep(diff, length(reads_strain_ref1)) |
544 |
llr_score = llr_score_nvecs(list(reads_strain_ref1, reads_strain_ref2)) |
545 |
llr_scores = c(llr_scores, llr_score) |
546 |
# Filtering on LLR Score |
547 |
if (llr_score < llr_thres) { |
548 |
tmp_nuc = list() |
549 |
# strain_ref1 |
550 |
tmp_nuc[[paste("chr_", strain_ref1, sep="")]] = chr |
551 |
tmp_nuc[[paste("lower_bound_", strain_ref1, sep="")]] = nuc_strain_ref1$lower_bound |
552 |
tmp_nuc[[paste("upper_bound_", strain_ref1, sep="")]] = nuc_strain_ref1$upper_bound |
553 |
tmp_nuc[[paste("mean_", strain_ref1, sep="")]] = signif(mean(reads_strain_ref1),5) |
554 |
tmp_nuc[[paste("sd_", strain_ref1, sep="")]] = signif(sd(reads_strain_ref1),5) |
555 |
tmp_nuc[[paste("nb_reads_", strain_ref1, sep="")]] = length(reads_strain_ref1) |
556 |
tmp_nuc[[paste("index_nuc_", strain_ref1, sep="")]] = index_nuc_strain_ref1 |
557 |
# strain_ref2 |
558 |
tmp_nuc[[paste("chr_", strain_ref2, sep="")]] = roi_strain_ref2$chr |
559 |
tmp_nuc[[paste("lower_bound_", strain_ref2, sep="")]] = nuc_strain_ref2$lower_bound |
560 |
tmp_nuc[[paste("upper_bound_", strain_ref2, sep="")]] = nuc_strain_ref2$upper_bound |
561 |
tmp_nuc[[paste("means_", strain_ref2, sep="")]] = signif(mean(reads_strain_ref2),5) |
562 |
tmp_nuc[[paste("sd_", strain_ref2, sep="")]] = signif(sd(reads_strain_ref2),5) |
563 |
tmp_nuc[[paste("nb_reads_", strain_ref2, sep="")]] = length(reads_strain_ref2) |
564 |
tmp_nuc[[paste("index_nuc_", strain_ref2, sep="")]] = index_nuc_strain_ref2 |
565 |
# common |
566 |
tmp_nuc[["llr_score"]] = signif(llr_score,5) |
567 |
tmp_nuc[["common_wp_pval"]] = signif(1-pchisq(2*tmp_nuc[["llr_score"]], df=2),5) |
568 |
tmp_nuc[["diff"]] = abs(abs((roi_strain_ref2$begin - roi_strain_ref2$end)) / 2 - abs((nuc_strain_ref2$lower_bound - nuc_strain_ref2$upper_bound)) / 2) |
569 |
common_nuc = dfadd(common_nuc, tmp_nuc) |
570 |
} |
571 |
} |
572 |
} |
573 |
} |
574 |
} else { |
575 |
print("WARNING! No roi for strain ref 2.") |
576 |
} |
577 |
} |
578 |
} |
579 |
|
580 |
if(length(unique(common_nuc[,1:3])[,1]) != length((common_nuc[,1:3])[,1])) { |
581 |
index_redundant = which(apply(common_nuc[,1:3][-length(common_nuc[,1]),] == common_nuc[,1:3][-1,] ,1,sum) == 3) |
582 |
to_remove_list = c() |
583 |
for (i in 1:length(index_redundant)) { |
584 |
if (common_nuc[index_redundant[i],15] < common_nuc[index_redundant[i]+1,15]) { |
585 |
to_remove = index_redundant[i] |
586 |
} else { |
587 |
to_remove = index_redundant[i] + 1 |
588 |
} |
589 |
to_remove_list = c(to_remove_list, to_remove) |
590 |
} |
591 |
common_nuc = common_nuc[-to_remove_list,] |
592 |
} |
593 |
|
594 |
if(length(unique(common_nuc[,8:10])[,1]) != length((common_nuc[,8:10])[,1])) { |
595 |
index_redundant = which(apply(common_nuc[,8:10][-length(common_nuc[,1]),] == common_nuc[,8:10][-1,] ,1,sum) == 3) |
596 |
to_remove_list = c() |
597 |
for (i in 1:length(index_redundant)) { |
598 |
if (common_nuc[index_redundant[i],15] < common_nuc[index_redundant[i]+1,15]) { |
599 |
to_remove = index_redundant[i] |
600 |
} else { |
601 |
to_remove = index_redundant[i] + 1 |
602 |
} |
603 |
to_remove_list = c(to_remove_list, to_remove) |
604 |
} |
605 |
common_nuc = common_nuc[-to_remove_list,] |
606 |
} |
607 |
|
608 |
return(list(common_nuc, llr_scores)) |
609 |
} else { |
610 |
print("WARNING, no nucs for strain_ref1.") |
611 |
return(NULL) |
612 |
} |
613 |
### Returns a list of clusterized nucleosomes, and all computed llr scores. |
614 |
}, ex=function(){ |
615 |
|
616 |
# Define new translate_cur function... |
617 |
translate_cur = function(roi, strain2, big_cur=NULL, config=NULL) { |
618 |
return(roi) |
619 |
} |
620 |
# Binding it by uncomment follwing lines. |
621 |
unlockBinding("translate_cur", as.environment("package:nucleominer")) |
622 |
unlockBinding("translate_cur", getNamespace("nucleominer")) |
623 |
assign("translate_cur", translate_cur, "package:nucleominer") |
624 |
assign("translate_cur", translate_cur, getNamespace("nucleominer")) |
625 |
lockBinding("translate_cur", getNamespace("nucleominer")) |
626 |
lockBinding("translate_cur", as.environment("package:nucleominer")) |
627 |
|
628 |
# Dealing with a region of interest |
629 |
roi =list(name="example", begin=1000, end=1300, chr="1", genome=rep("A",301), strain_ref1 = "STRAINREF1") |
630 |
roi2 = translate_cur(roi, roi$strain_ref1) |
631 |
replicates = list() |
632 |
for (j in 1:2) { |
633 |
samples = list() |
634 |
for (i in 1:3) { |
635 |
# Create TF output |
636 |
tf_nuc = list("chr"=paste("chr", roi$chr, sep=""), "center"=(roi$end + roi$begin)/2, "width"= 150, "correlation.score"= 0.9) |
637 |
outputs = dfadd(NULL,tf_nuc) |
638 |
outputs = filter_tf_outputs(outputs, roi$chr, roi$begin, roi$end) |
639 |
# Generate corresponding reads |
640 |
nb_reads = round(runif(1,170,230)) |
641 |
reads = round(rnorm(nb_reads, tf_nuc$center,20)) |
642 |
u_reads = sort(unique(reads)) |
643 |
strands = sample(c(rep("R",ceiling(length(u_reads)/2)),rep("F",floor(length(u_reads)/2)))) |
644 |
counts = apply(t(u_reads), 2, function(r) { sum(reads == r)}) |
645 |
shifts = apply(t(strands), 2, function(s) { if (s == "F") return(-tf_nuc$width/2) else return(tf_nuc$width/2)}) |
646 |
u_reads = u_reads + shifts |
647 |
inputs = data.frame(list("V1" = rep(roi$chr, length(u_reads)), |
648 |
"V2" = u_reads, |
649 |
"V3" = strands, |
650 |
"V4" = counts), stringsAsFactors=FALSE) |
651 |
samples[[length(samples) + 1]] = list(id=1, marker="Mnase_Seq", strain=paste("strain_ex",j,sep=""), total_reads = 10000000, roi=roi, inputs=inputs, outputs=outputs) |
652 |
} |
653 |
replicates[[length(replicates) + 1]] = samples |
654 |
} |
655 |
print(align_inter_strain_nucs(replicates)) |
656 |
}) |
657 |
|
658 |
|
659 |
|
660 |
|
661 |
|
662 |
|
663 |
|
664 |
|
665 |
|
666 |
|
667 |
|
668 |
|
669 |
|
670 |
|
671 |
|
672 |
|
673 |
|
674 |
|
675 |
|
676 |
|
677 |
|
678 |
|
679 |
|
680 |
|
681 |
|
682 |
|
683 |
fetch_mnase_replicates = function(# Prefetch data |
684 |
### Fetch and filter inputs and outpouts per region of interest. Organize it per replicates. |
685 |
strain, ##<< The strain we want mnase replicatesList of replicates. Each replicates is a vector of sample ids. |
686 |
roi, ##<< Region of interest. |
687 |
all_samples, ##<< Global list of samples. |
688 |
config=NULL, ##<< GLOBAL config variable |
689 |
only_fetch=FALSE, ##<< If TRUE, only fetch and not filtering. It is used tio load sample files into memory before forking. |
690 |
get_genome=FALSE, ##<< If TRUE, load corresponding genome sequence. |
691 |
get_ouputs=TRUE##<< If TRUE, get also ouput corresponding TF output files. |
692 |
) { |
693 |
samples=list() |
694 |
samples_ids = unique(all_samples[all_samples$marker == "Mnase_Seq" & all_samples$strain == strain,]$id) |
695 |
for (i in samples_ids) { |
696 |
sample = as.list(all_samples[all_samples$id==i,]) |
697 |
sample$orig_roi = roi |
698 |
sample$roi = translate_cur(roi, sample$strain, config = config) |
699 |
if (get_genome) { |
700 |
# Get Genome |
701 |
sample$roi$genome = get_content(config$FASTA_REFERENCE_GENOME_FILES[[sample$strain]], "fasta")[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end] |
702 |
} |
703 |
# Get inputs |
704 |
sample$inputs = get_content(paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep=""), "table", stringsAsFactors=FALSE) |
705 |
sample$total_reads = sum(sample$inputs[,4]) |
706 |
if (!only_fetch) { |
707 |
sample$inputs = filter_tf_inputs(sample$inputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300) |
708 |
} |
709 |
# Get TF outputs for Mnase_Seq samples |
710 |
if (sample$marker == "Mnase_Seq" & get_ouputs) { |
711 |
sample$outputs = get_content(paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep=""), "table", header=TRUE, sep="\t") |
712 |
if (!only_fetch) { |
713 |
sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300) |
714 |
} |
715 |
} |
716 |
samples[[length(samples) + 1]] = sample |
717 |
} |
718 |
return(samples) |
719 |
} |
720 |
|
721 |
substract_region = function(# Substract to a list of regions an other list of regions that intersect it. |
722 |
### This fucntion embed a recursive part. It occurs when a substracted region split an original region on two. |
723 |
region1, ##<< Original regions. |
724 |
region2 ##<< Regions to substract. |
725 |
) { |
726 |
rec_substract_region = function(region1, region2) { |
727 |
non_inter_fuzzy = apply(t(1:length(region1[,1])), 2, function(i) { |
728 |
cur_fuzzy = region1[i,] |
729 |
inter_wp = region2[region2$lower_bound <= cur_fuzzy$upper_bound & region2$upper_bound >= cur_fuzzy$lower_bound,] |
730 |
if (length(inter_wp[,1]) > 0) { |
731 |
ret = c() |
732 |
for (j in 1:length(inter_wp[,1])) { |
733 |
cur_wp = inter_wp[j,] |
734 |
if (cur_wp$lower_bound <= cur_fuzzy$lower_bound & cur_fuzzy$upper_bound <= cur_wp$upper_bound) { |
735 |
# remove cur_fuzzy |
736 |
ret = c() |
737 |
break |
738 |
} else if (cur_wp$lower_bound <= cur_fuzzy$lower_bound & cur_wp$upper_bound < cur_fuzzy$upper_bound) { |
739 |
# crop fuzzy |
740 |
cur_fuzzy$lower_bound = cur_wp$upper_bound + 1 |
741 |
ret = cur_fuzzy |
742 |
} else if (cur_fuzzy$lower_bound < cur_wp$lower_bound & cur_fuzzy$upper_bound <= cur_wp$upper_bound) { |
743 |
# crop fuzzy |
744 |
cur_fuzzy$upper_bound = cur_wp$lower_bound - 1 |
745 |
ret = cur_fuzzy |
746 |
} else if (cur_wp$lower_bound > cur_fuzzy$lower_bound & cur_wp$upper_bound < cur_fuzzy$upper_bound) { |
747 |
# split fuzzy |
748 |
tmp_ret_fuzzy_1 = cur_fuzzy |
749 |
tmp_ret_fuzzy_1$upper_bound = cur_wp$lower_bound - 1 |
750 |
tmp_ret_fuzzy_2 = cur_fuzzy |
751 |
tmp_ret_fuzzy_2$lower_bound = cur_wp$upper_bound + 1 |
752 |
ret = rec_substract_region(rbind(tmp_ret_fuzzy_1, tmp_ret_fuzzy_2), inter_wp) |
753 |
# print(ret) |
754 |
# ret = cur_fuzzy |
755 |
break |
756 |
} else { |
757 |
stop("WARNING NO ADAPTED CASE!") |
758 |
} |
759 |
} |
760 |
return(ret) |
761 |
} else { |
762 |
return(cur_fuzzy) |
763 |
} |
764 |
}) |
765 |
} |
766 |
non_inter_fuzzy = rec_substract_region(region1[,1:4], region2[,1:4]) |
767 |
if (is.null(non_inter_fuzzy)) {return(non_inter_fuzzy)} |
768 |
tmp_ulist = unlist(non_inter_fuzzy) |
769 |
tmp_names = names(tmp_ulist)[1:4] |
770 |
non_inter_fuzzy = data.frame(matrix(tmp_ulist, ncol=4, byrow=TRUE), stringsAsFactors=FALSE) |
771 |
names(non_inter_fuzzy) = tmp_names |
772 |
non_inter_fuzzy$chr = as.character(non_inter_fuzzy$chr) |
773 |
non_inter_fuzzy$chr = as.numeric(non_inter_fuzzy$chr) |
774 |
non_inter_fuzzy$lower_bound = as.numeric(non_inter_fuzzy$lower_bound) |
775 |
non_inter_fuzzy$upper_bound = as.numeric(non_inter_fuzzy$upper_bound) |
776 |
non_inter_fuzzy = non_inter_fuzzy[order(non_inter_fuzzy$lower_bound),] |
777 |
return(non_inter_fuzzy) |
778 |
} |
779 |
|
780 |
union_regions = function(# Aggregate regions that intersect themselves. |
781 |
### This function is based on sort of lower bounds to detect regions that intersect. We compare lower bound and upper bound of the porevious item. This function embed a while loop and break break regions list become stable. |
782 |
regions ##<< The Regions to be aggregated |
783 |
) { |
784 |
if (is.null(regions)) {return(regions)} |
785 |
if (nrow(regions) == 0) {return(regions)} |
786 |
old_length = length(regions[,1]) |
787 |
new_length = 0 |
788 |
|
789 |
while (old_length != new_length) { |
790 |
regions = regions[order(regions$lower_bound), ] |
791 |
regions$stop = !c(regions$lower_bound[-1] - regions$upper_bound[-length(regions$lower_bound)] <= 1, TRUE) |
792 |
vec_end_1 = which(regions$stop) |
793 |
if (length(vec_end_1) == 0) { |
794 |
vec_end_1 = c(length(regions$stop)) |
795 |
} |
796 |
if (vec_end_1[length(vec_end_1)] != length(regions$stop)) { |
797 |
vec_end_1 = c(vec_end_1, length(regions$stop)) |
798 |
} |
799 |
vec_beg_1 = c(1, vec_end_1[-length(vec_end_1)] + 1) |
800 |
union = apply(t(1:length(vec_beg_1)), 2, function(i) { |
801 |
chr = regions$chr[vec_beg_1[i]] |
802 |
lower_bound = min(regions$lower_bound[vec_beg_1[i]:vec_end_1[i]]) |
803 |
upper_bound = max(regions$upper_bound[vec_beg_1[i]:vec_end_1[i]]) |
804 |
cur_index = regions$cur_index[vec_beg_1[i]] |
805 |
data.frame(list(chr=chr, lower_bound=lower_bound, upper_bound=upper_bound, cur_index=cur_index)) |
806 |
}) |
807 |
union = collapse_regions(union) |
808 |
old_length = length(regions[,1]) |
809 |
new_length = length(union[,1]) |
810 |
regions = union |
811 |
} |
812 |
return(union) |
813 |
} |
814 |
|
815 |
# remove_aligned_wp = function(# Remove wp nucs from common nucs list. |
816 |
# ### It is based on common wp nucs index on nucs and region. |
817 |
# strain_maps, ##<< Nuc maps. |
818 |
# cur_index, ##<< The region of interest index. |
819 |
# tmp_common_nucs, ##<< the list of wp nucs. |
820 |
# strain##<< The strain to consider. |
821 |
# ){ |
822 |
# fuzzy_nucs = strain_maps[[strain]] |
823 |
# fuzzy_nucs = fuzzy_nucs[fuzzy_nucs$cur_index == cur_index,] |
824 |
# fuzzy_nucs = fuzzy_nucs[order(fuzzy_nucs$index_nuc),] |
825 |
# if (length(fuzzy_nucs[,1]) == 0) {return(fuzzy_nucs)} |
826 |
# if (sum(fuzzy_nucs$index_nuc == min(fuzzy_nucs$index_nuc):max(fuzzy_nucs$index_nuc)) != max(fuzzy_nucs$index_nuc)) {"Warning in index!"} |
827 |
# anti_index_1 = tmp_common_nucs[[paste("index_nuc", strain, sep="_")]] |
828 |
# fuzzy_nucs = fuzzy_nucs[-anti_index_1,] |
829 |
# return(fuzzy_nucs) |
830 |
# } |
831 |
|
832 |
translate_regions = function(# Translate a list of regions from a strain ref to another. |
833 |
### This function is an elaborated call to translate_cur. |
834 |
regions, ##<< Regions to be translated. |
835 |
combi, ##<< Combination of strains. |
836 |
cur_index, ##<< The region of interest index. |
837 |
config=NULL, ##<< GLOBAL config variable |
838 |
roi ##<< The region of interest. |
839 |
) { |
840 |
tr_regions = apply(t(1:length(regions[,1])), 2, function(i) { |
841 |
tmp_regions_ref2 = list(name="foo", begin=regions[i,]$lower_bound, end=regions[i,]$upper_bound, chr=as.character(regions[i,]$chr), strain_ref = combi[2]) |
842 |
big_cur = roi |
843 |
trs_tmp_regions_ref2 = translate_cur(tmp_regions_ref2, combi[1], config = config, big_cur = big_cur) |
844 |
if (is.null(trs_tmp_regions_ref2)) { |
845 |
return(NULL) |
846 |
} else { |
847 |
return(data.frame(list(chr=trs_tmp_regions_ref2$chr, lower_bound=min(trs_tmp_regions_ref2$begin, trs_tmp_regions_ref2$end), upper_bound=max(trs_tmp_regions_ref2$begin, trs_tmp_regions_ref2$end), cur_index=cur_index))) |
848 |
} |
849 |
}) |
850 |
|
851 |
return(collapse_regions(tr_regions)) |
852 |
} |
853 |
|
854 |
collapse_regions = function(# reformat an "apply manipulated" list of regions |
855 |
### Utils to reformat an "apply manipulated" list of regions |
856 |
regions ##< a list of regions |
857 |
) { |
858 |
if (is.null(regions)) { |
859 |
return(NULL) |
860 |
} else { |
861 |
regions = do.call(rbind, regions) |
862 |
regions$chr = as.character(regions$chr) |
863 |
regions$chr = as.numeric(regions$chr) |
864 |
regions$lower_bound = as.numeric(regions$lower_bound) |
865 |
regions$upper_bound = as.numeric(regions$upper_bound) |
866 |
regions = regions[order(regions$lower_bound),] |
867 |
return(regions) |
868 |
} |
869 |
} |
870 |
|
871 |
|
872 |
crop_fuzzy = function(# Crop bound of regions according to region of interest bound |
873 |
### The fucntion is no more necessary since we remove "big_cur" bug in translate_cur function. |
874 |
tmp_fuzzy_nucs, ##<< the regiuons to be croped. |
875 |
roi, ##<< The region of interest. |
876 |
strain, ##<< The strain to consider. |
877 |
config=NULL ##<< GLOBAL config variable |
878 |
) { |
879 |
tr_roi = translate_cur(roi, strain, config = config) |
880 |
tr_roi_begin = min(tr_roi$begin, tr_roi$end) |
881 |
tr_roi_end = max(tr_roi$begin, tr_roi$end) |
882 |
if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound < tr_roi_begin,1]) > 0) { |
883 |
tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound < tr_roi_begin,]$lower_bound = tr_roi_begin |
884 |
} |
885 |
if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound < tr_roi_begin,1]) > 0) { |
886 |
tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound < tr_roi_begin,]$upper_bound = tr_roi_begin |
887 |
} |
888 |
if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound > tr_roi_end,1]) > 0) { |
889 |
tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound > tr_roi_end,]$lower_bound = tr_roi_end |
890 |
} |
891 |
if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound > tr_roi_end,1]) > 0) { |
892 |
tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound > tr_roi_end,]$upper_bound = tr_roi_end |
893 |
} |
894 |
tmp_fuzzy_nucs = tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound != tmp_fuzzy_nucs$lower_bound,] |
895 |
return(tmp_fuzzy_nucs) |
896 |
} |
897 |
|
898 |
get_all_reads = function(# Retrieve Reads |
899 |
### Retrieve reads for a given marker, combi, form. |
900 |
marker, ##<< The marker to considere. |
901 |
combi, ##<< The starin combination to considere. |
902 |
form="wp", ##<< The nuc form to considere. |
903 |
config=NULL ##<< GLOBAL config variable |
904 |
) { |
905 |
all_reads = NULL |
906 |
for (manip in c("Mnase_Seq", marker)) { |
907 |
if (form == "unr") { |
908 |
out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_unr_and_nbreads.tab",sep="") |
909 |
tmp_res = read.table(file=out_filename, header=TRUE) |
910 |
tmp_res = tmp_res[tmp_res[,3] - tmp_res[,2] > 75,] |
911 |
tmp_res$form = form |
912 |
} else if (form == "wp") { |
913 |
out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_wp_and_nbreads.tab",sep="") |
914 |
tmp_res = read.table(file=out_filename, header=TRUE) |
915 |
tmp_res$form = form |
916 |
} else if (form == "wpunr") { |
917 |
out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_wp_and_nbreads.tab",sep="") |
918 |
tmp_res = read.table(file=out_filename, header=TRUE) |
919 |
tmp_res$form = "wp" |
920 |
out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_unr_and_nbreads.tab",sep="") |
921 |
tmp_res2 = read.table(file=out_filename, header=TRUE) |
922 |
tmp_res2 = tmp_res2[tmp_res2[,3] - tmp_res2[,2] > 75,] |
923 |
tmp_res2$form = "unr" |
924 |
tmp_res = rbind(tmp_res, tmp_res2) |
925 |
} |
926 |
if (is.null(all_reads)) { |
927 |
all_reads = tmp_res[,c(1:9,length(tmp_res))] |
928 |
} |
929 |
tmp_res = tmp_res[,-c(1:9,length(tmp_res))] |
930 |
all_reads = cbind(all_reads, tmp_res) |
931 |
} |
932 |
return(all_reads) |
933 |
} |
934 |
|
935 |
get_design = function(# Build the design for DESeq |
936 |
### This function build the design according sample properties. |
937 |
marker, ##<< The marker to considere. |
938 |
combi, ##<< The starin combination to considere. |
939 |
all_samples ##<< Global list of samples. |
940 |
) { |
941 |
off1 = 0 |
942 |
off2 = 0 |
943 |
manips = c("Mnase_Seq", marker) |
944 |
design_rownames = c() |
945 |
design_manip = c() |
946 |
design_strain = c() |
947 |
off2index = function(off) { |
948 |
switch(toString(off), |
949 |
"1"=c(0,1,1), |
950 |
"2"=c(1,0,1), |
951 |
"3"=c(1,1,0), |
952 |
c(1,1,1) |
953 |
) |
954 |
} |
955 |
for (manip in manips) { |
956 |
tmp_samples = all_samples[ ((all_samples$strain == combi[1] | all_samples$strain == combi[2]) & all_samples$marker == manip), ] |
957 |
tmp_samples = tmp_samples[order(tmp_samples$strain), ] |
958 |
if (manip == "H3K4me1" & (off1 != 0 & off2 ==0 )) { |
959 |
tmp_samples = tmp_samples[c(off2index(off1), c(1,1)) == 1,] |
960 |
} else { |
961 |
if (manip != "Mnase_Seq" & (off1 != 0 | off2 !=0)) { |
962 |
tmp_samples = tmp_samples[c(off2index(off1), off2index(off2)) == 1,] |
963 |
} |
964 |
} |
965 |
design_manip = c(design_manip, rep(manip, length(tmp_samples$id))) |
966 |
for (strain in combi) { |
967 |
cols = apply(t(tmp_samples[ (tmp_samples$strain == strain & tmp_samples$marker == manip), ]$id), 2, function(i){paste(strain, manip, i, sep="_")}) |
968 |
design_strain = c(design_strain, rep(strain, length(cols))) |
969 |
design_rownames = c(design_rownames, cols) |
970 |
} |
971 |
} |
972 |
snep_design = data.frame( row.names=design_rownames, manip=design_manip, strain=design_strain) |
973 |
return(snep_design) |
974 |
} |
975 |
|
976 |
plot_dist_samples = function(# Plot the distribution of reads. |
977 |
### This fuxntion use the DESeq nomalization feature to compare qualitatively the distribution. |
978 |
strain, ##<< The strain to considere. |
979 |
marker, ##<< The marker to considere. |
980 |
res, ##<< Data |
981 |
all_samples, ##<< Global list of samples. |
982 |
NEWPLOT = TRUE ##<< If FALSE the curve will be add to the current plot. |
983 |
) { |
984 |
cols = apply(t(all_samples[ (all_samples$strain == strain & all_samples$marker == marker), ]$id), 2, function(i){paste(strain, marker, i, sep="_")}) |
985 |
snepCountTable = res[,cols] |
986 |
snepDesign = data.frame( |
987 |
row.names = cols, |
988 |
manip = rep(marker, length(cols)), |
989 |
strain = rep(strain, length(cols)) |
990 |
) |
991 |
cdsFull = newCountDataSet(snepCountTable, snepDesign) |
992 |
sizeFactors = estimateSizeFactors(cdsFull) |
993 |
# print(sizeFactors[[1]]) |
994 |
sample_ids = all_samples[ (all_samples$strain == strain & all_samples$marker == marker), ]$id |
995 |
if (NEWPLOT) { |
996 |
plot(density(res[,paste(strain, marker, sample_ids[1], sep="_")] / sizeFactors[[1]][1]), col=0, main=paste(strain, marker)) |
997 |
NEWPLOT = FALSE |
998 |
} |
999 |
for (it in 1:length(sample_ids)) { |
1000 |
sample_id = sample_ids[it] |
1001 |
lines(density(res[,paste(strain, marker, sample_id, sep="_")] / sizeFactors[[1]][it]), col = it + 1, lty = it) |
1002 |
} |
1003 |
legend("topright", col=(1:length(sample_ids))+1, lty=1:length(sample_ids), legend=cols) |
1004 |
} |
1005 |
|
1006 |
analyse_design = function(# Launch DESeq methods. |
1007 |
### This function is based on DESeq example. It normalizes data, fit data to GLM model with and without interaction term and compares the two models. |
1008 |
snep_design, ##<< The design to consider. |
1009 |
reads ##<< The data to consider. |
1010 |
) { |
1011 |
snep_count_table = reads[, rownames(snep_design)] |
1012 |
cdsFull = newCountDataSet(snep_count_table, snep_design) |
1013 |
cdsFull1 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum") |
1014 |
fit1 = fitNbinomGLMs(cdsFull1, count ~ manip * strain) |
1015 |
|
1016 |
cdsFull0 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum") |
1017 |
fit0 = fitNbinomGLMs(cdsFull0, count ~ manip + strain) |
1018 |
|
1019 |
pvalsGLM = nbinomGLMTest( fit1, fit0 ) |
1020 |
return(list(fit1, fit0, snep_design, pvalsGLM)) |
1021 |
} |
1022 |
|
1023 |
|
1024 |
|
1025 |
|
1026 |
|
1027 |
|
1028 |
|
1029 |
|
1030 |
get_sneps = structure(function(# Compute the list of SNEPs for a given set of marker, strain combination and nuc form. |
1031 |
### This function uses |
1032 |
marker, ##<< The marker involved. |
1033 |
combi, ##<< The strain combination involved. |
1034 |
form, ##<< the nuc form involved. |
1035 |
all_samples, ##<< Global list of samples. |
1036 |
FDR = 0.0001, ## the specific False Discover Rate |
1037 |
config=NULL ##<< GLOBAL config variable |
1038 |
) { |
1039 |
# PRETREAT |
1040 |
snep_design = get_design(marker, combi, all_samples) |
1041 |
reads = get_all_reads(marker, combi, form, config=config) |
1042 |
# RUN ANALYSE |
1043 |
tmp_analyse = analyse_design(snep_design, reads) |
1044 |
# RESULTS |
1045 |
fit1 = tmp_analyse[[1]] |
1046 |
fit0 = tmp_analyse[[2]] |
1047 |
k = names(fit1) |
1048 |
reads[[k[2]]] = signif(fit1[[k[2]]], 5) |
1049 |
reads[[k[3]]] = signif(fit1[[k[3]]], 5) |
1050 |
reads[[k[4]]] = signif(fit1[[k[4]]], 5) |
1051 |
reads$pvalsGLM = signif(tmp_analyse[[4]], 5) |
1052 |
snep_design = tmp_analyse[[3]] |
1053 |
# print(snep_design) |
1054 |
thres = FDR(reads$pvalsGLM, FDR) |
1055 |
reads$snep_index = reads$pvalsGLM < thres |
1056 |
print(paste(sum(reads$snep_index), " SNEPs found for ", length(reads[,1])," nucs and ", FDR*100,"% of FDR.", sep = "")) |
1057 |
return(reads) |
1058 |
}, ex=function(){ |
1059 |
marker = "H3K4me1" |
1060 |
combi = c("BY", "YJM") |
1061 |
form = "wpunr" # "wp" | "unr" | "wpunr" |
1062 |
# foo = get_sneps(marker, combi, form) |
1063 |
# foo = get_sneps("H4K12ac", c("BY", "RM"), "wp") |
1064 |
}) |
1065 |
|
1066 |
|
1067 |
|
1068 |
|
1069 |
|
1070 |
|
1071 |
|
1072 |
|
1073 |
|
1074 |
|
1075 |
|
1076 |
|
1077 |
|
1078 |
|
1079 |
|
1080 |
|
1081 |
|
1082 |
|
1083 |
|
1084 |
|
1085 |
|
1086 |
|
1087 |
|
1088 |
|
1089 |
ROM2ARAB = function(# Roman to Arabic pair list. |
1090 |
### Utility to convert Roman numbers into Arabic numbers |
1091 |
){list( |
1092 |
"I" = 1, |
1093 |
"II" = 2, |
1094 |
"III" = 3, |
1095 |
"IV" = 4, |
1096 |
"V" = 5, |
1097 |
"VI" = 6, |
1098 |
"VII" = 7, |
1099 |
"VIII" = 8, |
1100 |
"IX" = 9, |
1101 |
"X" = 10, |
1102 |
"XI" = 11, |
1103 |
"XII" = 12, |
1104 |
"XIII" = 13, |
1105 |
"XIV" = 14, |
1106 |
"XV" = 15, |
1107 |
"XVI" = 16, |
1108 |
"XVII" = 17, |
1109 |
"XVIII" = 18, |
1110 |
"XIX" = 19, |
1111 |
"XX" = 20 |
1112 |
)} |
1113 |
|
1114 |
switch_pairlist = structure(function(# Switch a pairlist |
1115 |
### Take a pairlist key:value and return the switched pairlist value:key. |
1116 |
l ##<< The pairlist to switch. |
1117 |
) { |
1118 |
ret = list() |
1119 |
for (name in names(l)) { |
1120 |
ret[[as.character(l[[name]])]] = name |
1121 |
} |
1122 |
ret |
1123 |
### The switched pairlist. |
1124 |
}, ex=function(){ |
1125 |
l = list(key1 = "value1", key2 = "value2") |
1126 |
print(switch_pairlist(l)) |
1127 |
}) |
1128 |
|
1129 |
ARAB2ROM = function(# Arabic to Roman pair list. |
1130 |
### Utility to convert Arabic numbers to Roman numbers |
1131 |
){switch_pairlist(ROM2ARAB())} |
1132 |
|
1133 |
|
1134 |
|
1135 |
|
1136 |
c2c_extraction = function(# Extract a sub part of the corresponding c2c file |
1137 |
### This fonction allows to access to a specific part of the c2c file. |
1138 |
strain1, ##<< the key strain |
1139 |
strain2, ##<< the target strain |
1140 |
chr=NULL, ##<< if defined, the c2c will be filtered according to the chromosome value |
1141 |
lower_bound=NULL, ##<< if defined, the c2c will be filtered for part of the genome upper than lower_bound |
1142 |
upper_bound=NULL, ##<< if defined, the c2c will be filtered for part of the genome lower than upper_bound |
1143 |
config=NULL##<< GLOBAL config variable |
1144 |
) { |
1145 |
reverse = (strain1=="RM" & strain2=="BY") | strain1=="YJM" |
1146 |
# Launch c2c file |
1147 |
if (reverse) { |
1148 |
c2c_filename = config$C2C_FILES[[paste(strain2, "-", strain1, sep="")]] |
1149 |
} else { |
1150 |
c2c_filename = config$C2C_FILES[[paste(strain1, "-", strain2, sep="")]] |
1151 |
} |
1152 |
c2c = get_content(c2c_filename, "table", stringsAsFactors=FALSE) |
1153 |
# Filtering unagapped |
1154 |
c2c = c2c[c2c$V6=="-",] |
1155 |
# Reverse |
1156 |
if (reverse) { |
1157 |
tmp_col = c2c$V1 |
1158 |
c2c$V1 = c2c$V7 |
1159 |
c2c$V7 = tmp_col |
1160 |
tmp_col = c2c$V2 |
1161 |
c2c$V2 = c2c$V9 |
1162 |
c2c$V9 = tmp_col |
1163 |
tmp_col = c2c$V3 |
1164 |
c2c$V3 = c2c$V10 |
1165 |
c2c$V10 = tmp_col |
1166 |
} |
1167 |
if (!is.null(chr)) { |
1168 |
if (strain1 == "BY") { |
1169 |
chro_1 = paste("chr", ARAB2ROM()[[chr]], sep="") |
1170 |
} else if (strain1 == "RM") { |
1171 |
chro_1 = paste("supercontig_1.",chr,sep="") |
1172 |
} else if (strain1 == "YJM") { |
1173 |
chro_1 = switch_pairlist(config$FASTA_INDEXES$YJM)[[chr]] |
1174 |
} |
1175 |
c2c = c2c[c2c$V1 == chro_1,] |
1176 |
if (!is.null(lower_bound)) { |
1177 |
if (length(c2c[c2c$V3 < lower_bound & c2c$V2 < c2c$V3, 1] > 0)) {c2c[c2c$V3 < lower_bound & c2c$V2 < c2c$V3,c("V2", "V3") ] = lower_bound} |
1178 |
if (length(c2c[c2c$V2 < lower_bound & c2c$V3 < c2c$V2, 1] > 0)) {c2c[c2c$V2 < lower_bound & c2c$V3 < c2c$V2,c("V2", "V3") ] = lower_bound} |
1179 |
c2c = c2c[c2c$V2 - c2c$V3 != 0,] |
1180 |
} |
1181 |
if (!is.null(upper_bound)) { |
1182 |
if (length(c2c[c2c$V2 > upper_bound & c2c$V2 < c2c$V3, 1] > 0)) {c2c[c2c$V2 > upper_bound & c2c$V2 < c2c$V3, c("V2", "V3")] = upper_bound} |
1183 |
if (length(c2c[c2c$V3 > upper_bound & c2c$V3 < c2c$V2, 1] > 0)) {c2c[c2c$V3 > upper_bound & c2c$V3 < c2c$V2, c("V2", "V3")] = upper_bound} |
1184 |
c2c = c2c[c2c$V2 - c2c$V3 != 0,] |
1185 |
} |
1186 |
} |
1187 |
return(c2c) |
1188 |
# It retruns the appropriate c2c file part. |
1189 |
} |
1190 |
|
1191 |
translate_cur = structure(function(# Translate coords of a genome region. |
1192 |
### This function is used in the examples, usualy you have to define your own translation function and overwrite this one using \emph{unlockBinding} features. Please, refer to the example. |
1193 |
roi, ##<< Original genome region of interest. |
1194 |
strain2, ##<< The strain in wich you want the genome region of interest. |
1195 |
config=NULL, ##<< GLOBAL config variable |
1196 |
big_cur=NULL ##<< A largest region than roi use to filter c2c if it is needed. |
1197 |
) { |
1198 |
strain1 = roi$strain_ref |
1199 |
# Do something or nothing? |
1200 |
if (is.null(config$NEUTRAL_TRANSLATE_CUR)) { |
1201 |
config$NEUTRAL_TRANSLATE_CUR = FALSE |
1202 |
} |
1203 |
if (strain1 == strain2 | config$NEUTRAL_TRANSLATE_CUR) { |
1204 |
roi$strain_ref = strain2 |
1205 |
roi$length = roi$end - roi$begin + sign(roi$end - roi$begin) |
1206 |
return(roi) |
1207 |
} |
1208 |
|
1209 |
# Extract c2c file |
1210 |
if (!is.null(big_cur)) { |
1211 |
# Dealing with big_cur |
1212 |
if (roi$strain_ref != big_cur$strain_ref) { |
1213 |
big_cur = translate_cur(big_cur, roi$strain_ref, config=config) |
1214 |
} |
1215 |
if (big_cur$end < big_cur$begin) { |
1216 |
tmp_var = big_cur$begin |
1217 |
big_cur$begin = big_cur$end |
1218 |
big_cur$end = tmp_var |
1219 |
big_cur$length = big_cur$end - big_cur$begin + 1 |
1220 |
} |
1221 |
if (big_cur$chr!=roi$chr | roi$end > big_cur$end | roi$end < big_cur$begin | roi$begin > big_cur$end | roi$begin < big_cur$begin) { |
1222 |
print("WARNING! Trying to translate a roi not included in a big_cur.") |
1223 |
return(NULL) |
1224 |
} |
1225 |
c2c = c2c_extraction(strain1, strain2, big_cur$chr, big_cur$begin, big_cur$end, config=config) |
1226 |
} else { |
1227 |
# No big_cur |
1228 |
c2c = c2c_extraction(strain1, strain2, roi$chr, config=config) |
1229 |
} |
1230 |
|
1231 |
# Convert initial roi$chr into c2c format |
1232 |
reverse = (strain1=="RM" & strain2=="BY") | strain1=="YJM" |
1233 |
begin_1 = roi$begin |
1234 |
end_1 = roi$end |
1235 |
if (reverse) { |
1236 |
tmptransfostart = c2c[(c2c$V3>=begin_1 & c2c$V2<=begin_1 & c2c$V8==1) | (c2c$V2>=begin_1 & c2c$V3<=begin_1 & c2c$V8==-1),] |
1237 |
tmptransfostop = c2c[(c2c$V3>=end_1 & c2c$V2<=end_1 & c2c$V8==1) | (c2c$V2>=end_1 & c2c$V3<=end_1 & c2c$V8==-1),] |
1238 |
} else { |
1239 |
tmptransfostart = c2c[c2c$V3>=begin_1 & c2c$V2<=begin_1,] |
1240 |
tmptransfostop = c2c[c2c$V3>=end_1 & c2c$V2<=end_1,] |
1241 |
} |
1242 |
|
1243 |
# Never happend conditions ... |
1244 |
{ |
1245 |
if (length(tmptransfostart$V8) == 0) { |
1246 |
# begin_1 is between to lines: shift begin_1 to the start of 2nd line. |
1247 |
if (sum(c2c$V3 >= c2c$V2) != 0) { |
1248 |
tmp_c2c = c2c[c2c$V2>=begin_1,] |
1249 |
begin_1 = min(tmp_c2c$V2) |
1250 |
} else { |
1251 |
tmp_c2c = c2c[c2c$V3>=begin_1,] |
1252 |
begin_1 = min(tmp_c2c$V3) |
1253 |
} |
1254 |
if (reverse) { |
1255 |
tmptransfostart = c2c[(c2c$V3>=begin_1 & c2c$V2<=begin_1 & c2c$V8==1) | (c2c$V2>=begin_1 & c2c$V3<=begin_1 & c2c$V8==-1),] |
1256 |
} else { |
1257 |
tmptransfostart = c2c[c2c$V3>=begin_1 & c2c$V2<=begin_1,] |
1258 |
} |
1259 |
if (length(tmptransfostart$V8) == 0) { |
1260 |
if (!is.null(big_cur)) { |
1261 |
return(NULL) |
1262 |
tmptransfostart = c2c[c2c$V3>=big_cur$begin & c2c$V2<=big_cur$begin,] |
1263 |
} else { |
1264 |
print(tmptransfostart) |
1265 |
print(tmptransfostop) |
1266 |
stop("Never happend condition 1.") |
1267 |
} |
1268 |
} |
1269 |
} |
1270 |
if (length(tmptransfostop$V8) == 0) { |
1271 |
# end_1 is between to lines: shift end_1 to the end of 2nd line. |
1272 |
if (sum(c2c$V3 >= c2c$V2) != 0) { |
1273 |
tmp_c2c = c2c[c2c$V3<=end_1,] |
1274 |
end_1 = max(tmp_c2c$V3) |
1275 |
} else { |
1276 |
tmp_c2c = c2c[c2c$V2<=end_1,] |
1277 |
end_1 = max(tmp_c2c$V2) |
1278 |
} |
1279 |
if (reverse) { |
1280 |
tmptransfostop = c2c[(c2c$V3>=end_1 & c2c$V2<=end_1 & c2c$V8==1) | (c2c$V2>=end_1 & c2c$V3<=end_1 & c2c$V8==-1),] |
1281 |
} else { |
1282 |
tmptransfostop = c2c[c2c$V3>=end_1 & c2c$V2<=end_1,] |
1283 |
} |
1284 |
if (length(tmptransfostop$V8) == 0) { |
1285 |
if (!is.null(big_cur)) { |
1286 |
return(NULL) |
1287 |
tmptransfostop = c2c[c2c$V3>=big_cur$end & c2c$V2<=big_cur$end,] |
1288 |
} else { |
1289 |
print(tmptransfostart) |
1290 |
print(tmptransfostop) |
1291 |
stop("Never happend condition 2.") |
1292 |
} |
1293 |
} |
1294 |
} |
1295 |
if (length(tmptransfostart$V8) != 1) { |
1296 |
# print("many start") |
1297 |
tmptransfostart = tmptransfostart[tmptransfostart$V3>=begin_1 & tmptransfostart$V2==begin_1,] |
1298 |
if (length(tmptransfostart$V8) != 1) { |
1299 |
print(tmptransfostart) |
1300 |
print(tmptransfostop) |
1301 |
stop("Never happend condition 3.") |
1302 |
} |
1303 |
} |
1304 |
if (length(tmptransfostop$V8) != 1) { |
1305 |
# print("many stop") |
1306 |
tmptransfostop = tmptransfostop[tmptransfostop$V3==end_1 & tmptransfostop$V2<=end_1,] |
1307 |
if (length(tmptransfostop$V8) != 1) { |
1308 |
print(tmptransfostart) |
1309 |
print(tmptransfostop) |
1310 |
stop("Never happend condition 4.") |
1311 |
} |
1312 |
} |
1313 |
if (tmptransfostart$V7 != tmptransfostop$V7) { |
1314 |
print(tmptransfostart) |
1315 |
print(tmptransfostop) |
1316 |
stop("Problem with genome region of interest of strain 1. \nIt is translated over many contigs into strain 2 ref. \nSorry, but you have to redefine your region of interest.") |
1317 |
} |
1318 |
} |
1319 |
|
1320 |
# Deal with strand |
1321 |
if (tmptransfostart$V8 == 1) { |
1322 |
begin_2 = tmptransfostart$V9 + (begin_1 - tmptransfostart$V2) |
1323 |
end_2 = tmptransfostop$V9 + (end_1 - tmptransfostop$V2) |
1324 |
} else { |
1325 |
begin_2 = tmptransfostart$V9 - (begin_1 - tmptransfostart$V2) |
1326 |
end_2 = tmptransfostop$V9 - (end_1 - tmptransfostop$V2) |
1327 |
} |
1328 |
# Build returned roi |
1329 |
roi$strain_ref = strain2 |
1330 |
if (roi$strain_ref == "BY") { |
1331 |
roi$chr = ROM2ARAB()[[substr(tmptransfostart$V7, 4, 12)]] |
1332 |
} else { |
1333 |
roi$chr = config$FASTA_INDEXES[[strain2]][[tmptransfostart$V7]] |
1334 |
} |
1335 |
roi$begin = begin_2 |
1336 |
roi$end = end_2 |
1337 |
if (sign(roi$end - roi$begin) == 0) { |
1338 |
roi$length = 1 |
1339 |
} else { |
1340 |
roi$length = roi$end - roi$begin + sign(roi$end - roi$begin) * 1 |
1341 |
} |
1342 |
return(roi) |
1343 |
}, ex=function(){ |
1344 |
# Define new translate_cur function... |
1345 |
translate_cur = function(roi, strain2, config) { |
1346 |
strain1 = roi$strain_ref |
1347 |
if (strain1 == strain2) { |
1348 |
return(roi) |
1349 |
} else { |
1350 |
stop("Here is my new translate_cur function...") |
1351 |
} |
1352 |
} |
1353 |
# Binding it by uncomment follwing lines. |
1354 |
# unlockBinding("translate_cur", as.environment("package:nm")) |
1355 |
# unlockBinding("translate_cur", getNamespace("nm")) |
1356 |
# assign("translate_cur", translate_cur, "package:nm") |
1357 |
# assign("translate_cur", translate_cur, getNamespace("nm")) |
1358 |
# lockBinding("translate_cur", getNamespace("nm")) |
1359 |
# lockBinding("translate_cur", as.environment("package:nm")) |
1360 |
}) |
1361 |
|
1362 |
|
1363 |
compute_inter_all_strain_curs = function (# Compute Common Uninterrupted Regions (CUR) |
1364 |
### CURs are regions that can be aligned between the genomes |
1365 |
diff_allowed = 30, ##<< the maximum indel width allowe din a CUR |
1366 |
min_cur_width = 4000, ##<< The minimum width of a CUR |
1367 |
config = NULL ##<< GLOBAL config variable |
1368 |
) { |
1369 |
|
1370 |
check_overlaping = function(strain1, strain2, chr, lower_bound, upper_bound, config=NULL) { |
1371 |
c2c = c2c_extraction(strain1, strain2, chr, lower_bound, upper_bound, config=config) |
1372 |
check_homogeneity(c2c) |
1373 |
if (length(c2c[,1]) == 0 ) { |
1374 |
stop("WARNING! checking overlapping for a region corresponding to an empty c2c.") |
1375 |
} else { |
1376 |
lower_bounds = apply(t(1:nrow(c2c)), 2,function(i){l = c2c[i,]; min(l$V2, l$V3)}) |
1377 |
upper_bounds = apply(t(1:nrow(c2c)), 2,function(i){l = c2c[i,]; max(l$V2, l$V3)}) |
1378 |
tmp_index = order(lower_bounds) |
1379 |
lower_bounds = lower_bounds[tmp_index] |
1380 |
upper_bounds = upper_bounds[tmp_index] |
1381 |
tmp_diff = lower_bounds[-1] - upper_bounds[-length(upper_bounds)] |
1382 |
ov_index = which(tmp_diff < 0) |
1383 |
if(length(ov_index < 0) !=0 ) { |
1384 |
ov_index = ov_index[1] |
1385 |
print(paste("WARNING! Overlaping", " (", strain1, ",", strain2, ") chr: ", c2c[1,]$V1, sep="")) |
1386 |
c2c_corrupted = c2c[tmp_index,][c(ov_index, ov_index + 1),] |
1387 |
print(c2c_corrupted) |
1388 |
return(list(lower_bounds[ov_index+1] - 1, upper_bounds[ov_index] + 1)) |
1389 |
} |
1390 |
return(NULL) |
1391 |
} |
1392 |
} |
1393 |
|
1394 |
check_homogeneity = function(sub_c2c) { |
1395 |
tmp_signs = sign(sub_c2c$V2 - sub_c2c$V3) |
1396 |
tmp_signs = tmp_signs[tmp_signs != 0] |
1397 |
if (sum(tmp_signs[1] != tmp_signs)) { |
1398 |
print(paste("*************** ERROR, non homogenous region (sign)! ********************")) |
1399 |
print(tmp_signs) |
1400 |
} |
1401 |
tmp_signs2 = sign(sub_c2c$V9 - sub_c2c$V10) |
1402 |
tmp_signs2 = tmp_signs2[tmp_signs2 != 0] |
1403 |
if (sum(tmp_signs2[1] != tmp_signs2)) { |
1404 |
print(paste("*************** ERROR, non homogenous region (sign2)! ********************")) |
1405 |
print(tmp_signs2) |
1406 |
} |
1407 |
if (length(unique(sub_c2c[,c(1,7,8)])[,2]) != 1) { |
1408 |
print("*************** ERROR, non homogenous region chrs or V8! ********************") |
1409 |
} |
1410 |
} |
1411 |
|
1412 |
test_and_squeeze_rois = function(foo, config=NULL) { |
1413 |
is_it_ok = function(list1, list2) { |
1414 |
bar = cbind(list1$begin, list2$begin, abs(list1$begin - list2$begin), list1$end, list2$end, abs(list1$end - list2$end), list1$length, list2$length, abs(list1$length - list2$length)) |
1415 |
ok = length(bar[bar[,3] != 0 | bar[,6] != 0, ]) == 0 |
1416 |
if (!ok) { |
1417 |
print(bar[bar[,3] != 0 | bar[,6] != 0, ]) |
1418 |
} |
1419 |
return (ok) |
1420 |
} |
1421 |
squeeze_rois = function(list1, list2) { |
1422 |
rois = apply(t(1:nrow(list1)), 2, function(i){ |
1423 |
roi = list1[i,] |
1424 |
roi2 = list2[i,] |
1425 |
roi$begin = max(roi$begin, roi2$begin) |
1426 |
roi$end = min(roi$end, roi2$end) |
1427 |
roi$length = roi$end - roi$begin + 1 |
1428 |
return(roi) |
1429 |
}) |
1430 |
return(do.call(rbind, rois)) |
1431 |
} |
1432 |
# foo_orig = compute_inter_all_strain_curs2(config=config) |
1433 |
# foo = foo_orig |
1434 |
STOP = FALSE |
1435 |
nb_round = 0 |
1436 |
while(!STOP) { |
1437 |
nb_round = nb_round + 1 |
1438 |
print(paste("2-2 round #", nb_round, sep="")) |
1439 |
fooby = translate_curs(foo, "BY", config=config) |
1440 |
fooyjm = translate_curs(foo, "YJM", config=config) |
1441 |
fooyjmby = translate_curs(fooyjm, "BY", config=config) |
1442 |
if (!is_it_ok(fooby, fooyjmby)) { |
1443 |
print("case 1") |
1444 |
foo = squeeze_rois(fooby, fooyjmby) |
1445 |
next |
1446 |
} |
1447 |
foorm = translate_curs(foo, "RM", config=config) |
1448 |
foormby = translate_curs(foorm, "BY", config=config) |
1449 |
if (!is_it_ok(fooby, foormby)) { |
1450 |
print("case 2") |
1451 |
foo = squeeze_rois(fooby, foormby) |
1452 |
next |
1453 |
} |
1454 |
fooyjmrm = translate_curs(fooyjm, "RM", config=config) |
1455 |
fooyjmrmyjm = translate_curs(fooyjmrm, "YJM", config=config) |
1456 |
if (!is_it_ok(fooyjm, fooyjmrmyjm)) { |
1457 |
print("case 3") |
1458 |
foo = squeeze_rois(fooyjm, fooyjmrmyjm) |
1459 |
next |
1460 |
} |
1461 |
foormyjm = translate_curs(foorm, "YJM", config=config) |
1462 |
foormyjmrm = translate_curs(foormyjm, "RM", config=config) |
1463 |
if (!is_it_ok(foorm, foormyjmrm)) { |
1464 |
print("case 4") |
1465 |
foo = squeeze_rois(foorm, foormyjmrm) |
1466 |
next |
1467 |
} |
1468 |
foo = translate_curs(foo, "BY", config=config) |
1469 |
STOP = TRUE |
1470 |
} |
1471 |
STOP = FALSE |
1472 |
nb_round = 0 |
1473 |
while(!STOP) { |
1474 |
nb_round = nb_round + 1 |
1475 |
print(paste("3-3 round #", nb_round, sep="")) |
1476 |
fooby = translate_curs(foo, "BY", config=config) |
1477 |
foobyrm = translate_curs(fooby, "RM", config=config) |
1478 |
foobyrmyjm = translate_curs(foobyrm, "YJM", config=config) |
1479 |
foobyrmyjmby = translate_curs(foobyrmyjm, "BY", config=config) |
1480 |
if (!is_it_ok(fooby, foobyrmyjmby)) { |
1481 |
print("case 1") |
1482 |
foo = squeeze_rois(fooby, foobyrmyjmby) |
1483 |
} |
1484 |
fooby = translate_curs(foo, "BY", config=config) |
1485 |
foobyyjm = translate_curs(fooby, "YJM", config=config) |
1486 |
foobyyjmrm = translate_curs(foobyyjm, "RM", config=config) |
1487 |
foobyyjmrmby = translate_curs(foobyyjmrm, "BY", config=config) |
1488 |
if (!is_it_ok(fooby, foobyyjmrmby)) { |
1489 |
print("case 2") |
1490 |
foo = squeeze_rois(fooby, foobyyjmrmby) |
1491 |
next |
1492 |
} |
1493 |
foo = translate_curs(foo, "BY", config=config) |
1494 |
STOP = TRUE |
1495 |
} |
1496 |
print("end") |
1497 |
return(foo) |
1498 |
} |
1499 |
|
1500 |
get_inter_strain_rois = function(strain1, strain2, diff_allowed = 30, min_cur_width = 200, config=NULL) { |
1501 |
c2c = c2c_extraction(strain1, strain2, config=config) |
1502 |
# computing diffs |
1503 |
diff = c2c$V2[-1] - c2c$V3[-length(c2c$V2)] |
1504 |
diff2 = c2c$V9[-1] - c2c$V10[-length(c2c$V2)] |
1505 |
# Filtering |
1506 |
indexes_stop = which(abs(diff) > diff_allowed | abs(diff2) > diff_allowed) |
1507 |
indexes_start = c(1, indexes_stop[-length(indexes_stop)] + rep(1, length(indexes_stop) -1)) |
1508 |
rois = apply(t(1:length(indexes_start)), 2, function(i) { |
1509 |
if ( i %% 20 == 1) print(paste(i, "/", length(indexes_start))) |
1510 |
returned_rois = NULL |
1511 |
start = indexes_start[i] |
1512 |
stop = indexes_stop[i] |
1513 |
sub_c2c = c2c[start:stop,] |
1514 |
check_homogeneity(sub_c2c) |
1515 |
if (strain1 == "BY") { |
1516 |
chr = ROM2ARAB()[[substr(sub_c2c[1,]$V1,4,10)]] |
1517 |
} else { |
1518 |
chr = config$FASTA_INDEXES[[strain1]][[sub_c2c[1,]$V1]] |
1519 |
} |
1520 |
roi = list(chr=chr, begin=min(c(sub_c2c$V2,sub_c2c$V3)), end=max(c(sub_c2c$V2,sub_c2c$V3)), strain_ref=strain1) |
1521 |
roi$length = roi$end - roi$begin + 1 |
1522 |
if (roi$length >= min_cur_width) { |
1523 |
lower_bound = roi$begin |
1524 |
upper_bound = roi$end |
1525 |
check = check_overlaping(strain1, strain2, chr, lower_bound, upper_bound, config=config) |
1526 |
while(!is.null(check)) { |
1527 |
# print(check) |
1528 |
roi1 = roi |
1529 |
roi1$end = check[[1]] |
1530 |
roi1$length = roi1$end - roi1$begin + 1 |
1531 |
if (roi1$length >= min_cur_width) { |
1532 |
returned_rois = dfadd(returned_rois,roi1) |
1533 |
} |
1534 |
roi$begin = check[[2]] |
1535 |
roi$length = roi$end - roi$begin + 1 |
1536 |
if (roi$length >= min_cur_width) { |
1537 |
lower_bound = min(roi$begin, roi$end) |
1538 |
upper_bound = max(roi$begin, roi$end) |
1539 |
check = check_overlaping(strain1, strain2, chr, lower_bound, upper_bound, config=config) |
1540 |
} else { |
1541 |
check = NULL |
1542 |
roi = NULL |
1543 |
} |
1544 |
} |
1545 |
returned_rois = dfadd(returned_rois,roi) |
1546 |
} |
1547 |
}) |
1548 |
rois = do.call(rbind,rois) |
1549 |
rois = rois[order(as.numeric(rois$chr), rois$begin), ] |
1550 |
return(rois) |
1551 |
} |
1552 |
|
1553 |
translate_curs = function(rois, target_strain, config) { |
1554 |
tr_rois = apply(t(1:nrow(rois)), 2, function(i){ |
1555 |
roi = rois[i,] |
1556 |
tr_roi = translate_cur(roi, target_strain, config=config) |
1557 |
tmp_begin = min(tr_roi$begin, tr_roi$end) |
1558 |
tmp_end = max(tr_roi$begin, tr_roi$end) |
1559 |
tr_roi$begin = tmp_begin |
1560 |
tr_roi$end = tmp_end |
1561 |
tr_roi$length = tr_roi$end - tr_roi$begin + 1 |
1562 |
return(tr_roi) |
1563 |
}) |
1564 |
tr_rois = do.call(rbind, tr_rois) |
1565 |
return(tr_rois) |
1566 |
} |
1567 |
|
1568 |
combis = list(c("BY", "RM"), c("BY", "YJM"), c("RM", "YJM")) |
1569 |
rois = list() |
1570 |
for (combi in combis) { |
1571 |
strain1 = combi[1] |
1572 |
strain2 = combi[2] |
1573 |
print(paste(strain1, strain2)) |
1574 |
rois_fwd = get_inter_strain_rois(strain1, strain2, min_cur_width = min_cur_width, diff_allowed = diff_allowed, config=config) |
1575 |
strain1 = combi[2] |
1576 |
strain2 = combi[1] |
1577 |
print(paste(strain1, strain2)) |
1578 |
rois_rev = get_inter_strain_rois(strain1, strain2, min_cur_width = min_cur_width, diff_allowed = diff_allowed, config=config) |
1579 |
tr_rois_rev = translate_curs(rois_rev, combi[1], config) |
1580 |
region1 = rois_fwd |
1581 |
region2 = tr_rois_rev |
1582 |
rois[[paste(combi[1], combi[2], sep="_")]] = intersect_region(rois_fwd, tr_rois_rev) |
1583 |
} |
1584 |
reducted_1_rois = intersect_region(rois[["BY_RM"]], rois[["BY_YJM"]]) |
1585 |
reducted_1_rois = reducted_1_rois[reducted_1_rois$length >= min_cur_width, ] |
1586 |
tr_reducted_1_rois = translate_curs(reducted_1_rois, "RM", config) |
1587 |
reducted_2_rois = intersect_region(tr_reducted_1_rois, rois[["RM_YJM"]]) |
1588 |
reducted_2_rois = reducted_2_rois[reducted_2_rois$length >= min_cur_width, ] |
1589 |
reducted_rois = translate_curs(reducted_2_rois, "BY", config) |
1590 |
reducted_rois = reducted_rois[order(as.numeric(reducted_rois$chr), reducted_rois$begin), ] |
1591 |
squeezed_rois = test_and_squeeze_rois(reducted_rois, config=config) |
1592 |
return (squeezed_rois) |
1593 |
} |
1594 |
|
1595 |
intersect_region = function(# Returns the intersection of 2 list on regions. |
1596 |
### This function... |
1597 |
region1, ##<< Original regions. |
1598 |
region2 ##<< Regions to intersect. |
1599 |
) { |
1600 |
intersection = apply(t(1:nrow(region1)), 2, function(i) { |
1601 |
roi1 = region1[i, ] |
1602 |
sub_regions2 = region2[region2$chr == roi1$chr, ] |
1603 |
sub_regions2 = sub_regions2[roi1$begin <= sub_regions2$begin & sub_regions2$begin <= roi1$end | |
1604 |
roi1$begin <= sub_regions2$end & sub_regions2$end <= roi1$end | |
1605 |
sub_regions2$begin < roi1$begin & roi1$end < sub_regions2$end |
1606 |
, ] |
1607 |
if (nrow(sub_regions2) == 0) { |
1608 |
print("removing a region") |
1609 |
return(NULL) |
1610 |
} else if (nrow(sub_regions2) > 1) { |
1611 |
print("more than one region in intersect_region") |
1612 |
return(do.call(rbind, apply(t(1:nrow(sub_regions2)), 2, function(i) {intersect_region(roi1, sub_regions2[i,])}))) |
1613 |
} else { |
1614 |
roi2 = sub_regions2[1,] |
1615 |
if (roi1$begin < roi2$begin) { |
1616 |
print("not the same begin") |
1617 |
roi1$begin = roi2$begin |
1618 |
roi1$length = roi1$end - roi1$begin + 1 |
1619 |
} |
1620 |
if (roi1$end > roi2$end) { |
1621 |
print("not the same end") |
1622 |
roi1$end = roi2$end |
1623 |
roi1$length = roi1$end - roi1$begin + 1 |
1624 |
} |
1625 |
return(roi1) |
1626 |
} |
1627 |
}) |
1628 |
return(do.call(rbind,intersection)) |
1629 |
} |
1630 |
|
1631 |
build_replicates = structure(function(# Stage replicates data |
1632 |
### This function loads in memory the data corresponding to the given experiments. |
1633 |
expe, ##<< a list of vectors corresponding to replicates. |
1634 |
roi, ##<< the region that we are interested in. |
1635 |
only_fetch=FALSE, ##<< filter or not inputs. |
1636 |
get_genome=FALSE,##<< Load or not corresponding genome. |
1637 |
all_samples, ##<< Global list of samples. |
1638 |
config=NULL ##<< GLOBAL config variable. |
1639 |
) { |
1640 |
build_samples = function(samples_ids, roi, only_fetch=FALSE, get_genome=TRUE, get_ouputs=TRUE, all_samples) { |
1641 |
samples=list() |
1642 |
for (i in samples_ids) { |
1643 |
sample = as.list(all_samples[all_samples$id==i,]) |
1644 |
sample$orig_roi = roi |
1645 |
sample$roi = translate_cur(roi, sample$strain, config = config) |
1646 |
if (get_genome) { |
1647 |
# Get Genome |
1648 |
fasta_ref_filename = config$FASTA_REFERENCE_GENOME_FILES[[sample$strain]] |
1649 |
sample$roi$genome = get_content(fasta_ref_filename, "fasta")[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end] |
1650 |
} |
1651 |
# Get inputs |
1652 |
sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep="") |
1653 |
sample$inputs = get_content(sample_inputs_filename, "table", stringsAsFactors=FALSE) |
1654 |
sample$total_reads = sum(sample$inputs[,4]) |
1655 |
if (!only_fetch) { |
1656 |
sample$inputs = filter_tf_inputs(sample$inputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300, filter_for_coverage=TRUE) |
1657 |
} |
1658 |
# Get TF outputs for Mnase_Seq samples |
1659 |
if (sample$marker == "Mnase_Seq" & get_ouputs) { |
1660 |
sample_outputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep="") |
1661 |
sample$outputs = get_content(sample_outputs_filename, "table", header=TRUE, sep="\t") |
1662 |
if (!only_fetch) { |
1663 |
sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300) |
1664 |
} |
1665 |
} |
1666 |
samples[[length(samples) + 1]] = sample |
1667 |
} |
1668 |
return(samples) |
1669 |
} |
1670 |
replicates = list() |
1671 |
for(samples_ids in expe) { |
1672 |
samples = build_samples(samples_ids, roi, only_fetch=only_fetch, get_genome=get_genome, all_samples=all_samples) |
1673 |
replicates[[length(replicates) + 1]] = samples |
1674 |
} |
1675 |
return(replicates) |
1676 |
}, ex = function() { |
1677 |
# library(rjson) |
1678 |
# library(nucleominer) |
1679 |
# |
1680 |
# # Read config file |
1681 |
# json_conf_file = "nucleominer_config.json" |
1682 |
# config = fromJSON(paste(readLines(json_conf_file), collapse="")) |
1683 |
# # Read sample file |
1684 |
# all_samples = get_content(config$CSV_SAMPLE_FILE, "cvs", sep=";", head=TRUE, stringsAsFactors=FALSE) |
1685 |
# # here are the sample ids in a list |
1686 |
# expes = list(c(1)) |
1687 |
# # here is the region that we wnt to see the coverage |
1688 |
# cur = list(chr="8", begin=472000, end=474000, strain_ref="BY") |
1689 |
# # it displays the corverage |
1690 |
# replicates = build_replicates(expes, cur, all_samples=all_samples, config=config) |
1691 |
# out = watch_samples(replicates, config$READ_LENGTH, |
1692 |
# plot_coverage = TRUE, |
1693 |
# plot_squared_reads = FALSE, |
1694 |
# plot_ref_genome = FALSE, |
1695 |
# plot_arrow_raw_reads = FALSE, |
1696 |
# plot_arrow_nuc_reads = FALSE, |
1697 |
# plot_gaussian_reads = FALSE, |
1698 |
# plot_gaussian_unified_reads = FALSE, |
1699 |
# plot_ellipse_nucs = FALSE, |
1700 |
# plot_wp_nucs = FALSE, |
1701 |
# plot_wp_nuc_model = FALSE, |
1702 |
# plot_common_nucs = FALSE, |
1703 |
# height = 50) |
1704 |
}) |
1705 |
|
1706 |
|
1707 |
|
1708 |
|
1709 |
|
1710 |
|
1711 |
|
1712 |
|
1713 |
|
1714 |
|
1715 |
|
1716 |
|
1717 |
|
1718 |
|
1719 |
|
1720 |
|
1721 |
|
1722 |
|
1723 |
|
1724 |
|
1725 |
|
1726 |
|
1727 |
|
1728 |
|
1729 |
|
1730 |
|
1731 |
|
1732 |
|
1733 |
|
1734 |
|
1735 |
|
1736 |
|
1737 |
|
1738 |
|
1739 |
|
1740 |
|
1741 |
|
1742 |
|
1743 |
|
1744 |
|
1745 |
|
1746 |
|
1747 |
|
1748 |
|
1749 |
|
1750 |
|
1751 |
|
1752 |
|
1753 |
|
1754 |
|
1755 |
|
1756 |
|
1757 |
|
1758 |
|
1759 |
|
1760 |
|
1761 |
|
1762 |
|
1763 |
|
1764 |
|
1765 |
|
1766 |
|
1767 |
|
1768 |
|
1769 |
|
1770 |
|
1771 |
|
1772 |
|
1773 |
|
1774 |
|
1775 |
|
1776 |
|
1777 |
|
1778 |
|
1779 |
|
1780 |
|
1781 |
|
1782 |
|
1783 |
|
1784 |
|
1785 |
|
1786 |
|
1787 |
|
1788 |
watch_samples = function(# Watching analysis of samples |
1789 |
### This function allows to view analysis for a particuler region of the genome. |
1790 |
replicates, ##<< replicates under the form... |
1791 |
read_length, ##<< length of the reads |
1792 |
plot_ref_genome = TRUE, ##<< Plot (or not) reference genome. |
1793 |
plot_arrow_raw_reads = TRUE, ##<< Plot (or not) arrows for raw reads. |
1794 |
plot_arrow_nuc_reads = TRUE, ##<< Plot (or not) arrows for reads aasiocied to a nucleosome. |
1795 |
plot_squared_reads = TRUE, ##<< Plot (or not) reads in the square fashion. |
1796 |
plot_coverage = FALSE, ##<< Plot (or not) reads in the covergae fashion. fashion. |
1797 |
plot_gaussian_reads = TRUE, ##<< Plot (or not) gaussian model of a F anf R reads. |
1798 |
plot_gaussian_unified_reads = TRUE, ##<< Plot (or not) gaussian model of a nuc. |
1799 |
plot_ellipse_nucs = TRUE, ##<< Plot (or not) ellipse for a nuc. |
1800 |
change_col = TRUE, ##<< Change the color of each nucleosome. |
1801 |
plot_wp_nucs = TRUE, ##<< Plot (or not) cluster of nucs |
1802 |
plot_fuzzy_nucs = FALSE, ##<< Plot (or not) cluster of fuzzy |
1803 |
plot_wp_nuc_model = TRUE, ##<< Plot (or not) gaussian model for a cluster of nucs |
1804 |
plot_common_nucs = FALSE, ##<< Plot (or not) aligned reads. |
1805 |
plot_common_unrs = FALSE, ##<< Plot (or not) unaligned nucleosomal refgions (UNRs). |
1806 |
plot_wp_nucs_4_nonmnase = FALSE, ##<< Plot (or not) clusters for non inputs samples. |
1807 |
plot_chain = FALSE, ##<< Plot (or not) clusterised nuceosomes between mnase samples. |
1808 |
plot_sample_id = FALSE, ##<< Plot (or not) the sample id for each sample. |
1809 |
aggregated_intra_strain_nucs = NULL, ##<< list of aggregated intra strain nucs. If NULL, it will be computed. |
1810 |
aligned_inter_strain_nucs = NULL, ##<< list of aligned inter strain nucs. If NULL, it will be computed. |
1811 |
height = 10, ##<< Number of reads in per million read for each sample, graphical parametre for the y axis. |
1812 |
main=NULL, ##<< main title of the produced plot |
1813 |
xlab=NULL, ##<< xlab of the produced plot |
1814 |
ylab="#reads (per million reads)", ##<< ylab of the produced plot |
1815 |
config=NULL ##<< GLOBAL config variable |
1816 |
){ |
1817 |
returned_list = list() |
1818 |
# Computing global display parameters |
1819 |
if (replicates[[1]][[1]]$roi[["begin"]] < replicates[[1]][[1]]$roi[["end"]]) { |
1820 |
x_min_glo = replicates[[1]][[1]]$roi[["begin"]] |
1821 |
x_max_glo = replicates[[1]][[1]]$roi[["end"]] |
1822 |
} else { |
1823 |
x_min_glo = - replicates[[1]][[1]]$roi[["begin"]] |
1824 |
x_max_glo = - replicates[[1]][[1]]$roi[["end"]] |
1825 |
} |
1826 |
base_glo = 0 |
1827 |
nb_rank_glo = 0 |
1828 |
for (samples in replicates) { |
1829 |
nb_rank_glo = nb_rank_glo + length(samples) |
1830 |
} |
1831 |
ylim_glo = c(base_glo, base_glo + height * nb_rank_glo) |
1832 |
y_min_glo = min(ylim_glo) |
1833 |
y_max_glo = max(ylim_glo) |
1834 |
delta_y_glo = y_max_glo - y_min_glo |
1835 |
# Plot main frame |
1836 |
if (is.null(xlab)) { |
1837 |
xlab = paste("Ref strain:", replicates[[1]][[1]]$strain, "chr: ", replicates[[1]][[1]]$roi$chr) |
1838 |
} |
1839 |
plot(c(x_min_glo,x_max_glo), c(0,0), ylim=ylim_glo, col=0, yaxt="n", ylab=ylab, xlab=xlab, main=main ) |
1840 |
axis(2, at=0:(nb_rank_glo*2) * delta_y_glo / (nb_rank_glo*2), labels=c(rep(c(height/2,0),nb_rank_glo),height/2)) |
1841 |
# Go |
1842 |
replicates_wp_nucs = list() |
1843 |
wp_maps = list() |
1844 |
fuzzy_maps = list() |
1845 |
for (replicate_rank in 1:length(replicates)) { |
1846 |
# Computing replicate parameters |
1847 |
nb_rank = length(samples) |
1848 |
base = (replicate_rank-1) * height * nb_rank |
1849 |
ylim = c(base, base + height * nb_rank) |
1850 |
y_min = min(ylim) |
1851 |
y_max = max(ylim) |
1852 |
delta_y = y_max - y_min |
1853 |
samples = replicates[[replicate_rank]] |
1854 |
for (sample_rank in 1:length(samples)) { |
1855 |
# computing sample parameters |
1856 |
sample = samples[[sample_rank]] |
1857 |
y_lev = y_min + (sample_rank - 0.5) * delta_y/nb_rank |
1858 |
if (plot_sample_id) { |
1859 |
text(x_min_glo, y_lev + height/2 - delta_y_glo/100, labels=paste("(",sample$id,") ",sample$strain, " ", sample$marker, sep="")) |
1860 |
} |
1861 |
|
1862 |
if (sample$roi[["begin"]] < sample$roi[["end"]]) { |
1863 |
x_min = sample$roi[["begin"]] |
1864 |
x_max = sample$roi[["end"]] |
1865 |
} else { |
1866 |
x_min = - sample$roi[["begin"]] |
1867 |
x_max = - sample$roi[["end"]] |
1868 |
} |
1869 |
shift = x_min_glo - x_min |
1870 |
# Plot Genome seq |
1871 |
if (plot_ref_genome) { |
1872 |
text(1:length(sample$roi$genome) + x_min - 1 + shift, rep(y_lev - height/2, length(sample$roi$genome)), labels=toupper(sample$roi$genome), cex=dev.size()[1]*9/(x_max-x_min), family="Courier") |
1873 |
} |
1874 |
# Plot reads |
1875 |
reads = sample$inputs |
1876 |
signs = sign_from_strand(reads[,3]) |
1877 |
if (plot_arrow_raw_reads) { |
1878 |
arrows(sign(x_min) * reads[,2] + shift, sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads + y_lev, sign(x_min) * (reads[,2] + signs * read_length) + shift, sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads + y_lev, |
1879 |
col=1, length=0.15/nb_rank) |
1880 |
} |
1881 |
if (plot_squared_reads) { |
1882 |
# require(plotrix) |
1883 |
rect(sign(x_min) * reads[,2] + shift, rep(y_lev,length(reads[,1])), sign(x_min) * (reads[,2] + signs * read_length) + shift, y_lev + sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads, col=adjustcolor(1, alpha.f = 0.1),border=0) |
1884 |
} |
1885 |
if (plot_coverage) { |
1886 |
if (length(reads[,1]) != 0) { |
1887 |
step_h = sign(x_min) * signs * reads[,4] |
1888 |
step_b = sign(x_min) * reads[,2] + shift |
1889 |
step_e = sign(x_min) * (reads[,2] + signs * 150) + shift |
1890 |
steps_x = min(step_b, step_e):max(step_b, step_e) |
1891 |
steps_y = rep(0, length(steps_x)) |
1892 |
for (i in 1:length(step_h)) { |
1893 |
steps_y[which(steps_x==min(step_b[i], step_e[i]))] = steps_y[which(steps_x==min(step_b[i], step_e[i]))] + abs(step_h[i]) |
1894 |
steps_y[which(steps_x==max(step_b[i], step_e[i]))] = steps_y[which(steps_x==max(step_b[i], step_e[i]))] - abs(step_h[i]) |
1895 |
} |
1896 |
tmp_index = which(steps_y != 0) |
1897 |
steps_x = steps_x[tmp_index] |
1898 |
steps_y = steps_y[tmp_index] |
1899 |
tmp_current_level = 0 |
1900 |
for (i in 1:length(steps_y)) { |
1901 |
steps_y[i] = tmp_current_level + steps_y[i] |
1902 |
tmp_current_level = steps_y[i] |
1903 |
} |
1904 |
steps_y = c(0, steps_y) |
1905 |
steps_y = steps_y * 1000000/sample$total_reads |
1906 |
} else { |
1907 |
steps_y = c(0, 0, 0) |
1908 |
steps_x = c(x_min, x_max) |
1909 |
} |
1910 |
# print(steps_x) |
1911 |
# print(steps_y) |
1912 |
lines(stepfun(steps_x, steps_y + y_lev), pch="") |
1913 |
abline(y_lev,0) |
1914 |
returned_list[[paste("cov", sample$id, sep="_")]] = stepfun(steps_x, steps_y) |
1915 |
} |
1916 |
# Plot nucs |
1917 |
if (sample$marker == "Mnase_Seq" & (plot_squared_reads | plot_gaussian_reads | plot_gaussian_unified_reads | plot_arrow_nuc_reads)) { |
1918 |
nucs = sample$outputs |
1919 |
if (length(nucs$center) > 0) { |
1920 |
col = 1 |
1921 |
for (i in 1:length(nucs$center)) { |
1922 |
if (change_col) { |
1923 |
col = col + 1 |
1924 |
} else { |
1925 |
col = "blue" |
1926 |
} |
1927 |
nuc = nucs[i,] |
1928 |
involved_reads = filter_tf_inputs(reads, sample$roi$chr, nuc$lower_bound, nuc$upper_bound, nuc_width = nuc$width) |
1929 |
involved_signs = apply(t(involved_reads[,3]), 2, function(strand) { if (strand == "F") return(1) else return(-1)}) |
1930 |
total_involved_reads = sum(involved_reads[,4]) |
1931 |
if (plot_arrow_nuc_reads ) { |
1932 |
arrows(sign(x_min) * involved_reads[,2] + shift, sign(x_min) * involved_signs * involved_reads[,4] * 1000000/sample$total_reads + y_lev, sign(x_min) * (involved_reads[,2] + involved_signs * read_length) + shift, sign(x_min) * involved_signs * involved_reads[,4] * 1000000/sample$total_reads + y_lev, |
1933 |
col=col, length=0.15/nb_rank) |
1934 |
} |
1935 |
if (plot_gaussian_reads | plot_gaussian_unified_reads) { |
1936 |
flatted_reads = flat_reads(involved_reads, nuc$width) |
1937 |
delta_x = (nuc$center - nuc$width):(nuc$center + nuc$width) |
1938 |
} |
1939 |
if (plot_gaussian_reads ) { |
1940 |
flatted_reads = flat_reads(involved_reads, nuc$width) |
1941 |
delta_x = (nuc$center - nuc$width):(nuc$center + nuc$width) |
1942 |
lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[1]]), sd(flatted_reads[[1]])) * length(flatted_reads[[1]]) * sign(x_min) * height/5 + y_lev, col=col) |
1943 |
lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[2]]), sd(flatted_reads[[2]])) * length(flatted_reads[[2]]) * -1 * sign(x_min) * height/5 + y_lev, col=col) |
1944 |
} |
1945 |
if (plot_gaussian_unified_reads ) { |
1946 |
lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[3]]), sd(flatted_reads[[3]])) * length(flatted_reads[[3]]) * height/5 + y_lev, col=col, lty=1) |
1947 |
} |
1948 |
if (plot_ellipse_nucs) { |
1949 |
# require(plotrix) |
1950 |
draw.ellipse(sign(x_min) * nuc$center + shift, y_lev, nuc$width/2, total_involved_reads/nuc$width * height/5, border=col) |
1951 |
} |
1952 |
} |
1953 |
} else { |
1954 |
print("WARNING! No nucs to print.") |
1955 |
} |
1956 |
} |
1957 |
} |
1958 |
|
1959 |
# Plot wp nucs |
1960 |
if ((plot_wp_nucs_4_nonmnase | sample$marker == "Mnase_Seq") & (plot_wp_nucs | plot_fuzzy_nucs | plot_common_nucs | plot_chain)) { |
1961 |
if (samples[[1]]$marker == "Mnase_Seq") { |
1962 |
if (is.null(aggregated_intra_strain_nucs)) { |
1963 |
wp_nucs = aggregate_intra_strain_nucs(samples)[[1]] |
1964 |
} else { |
1965 |
wp_nucs = aggregated_intra_strain_nucs[[replicate_rank]] |
1966 |
} |
1967 |
} else { |
1968 |
wp_nucs = replicates_wp_nucs[[replicate_rank-2]] |
1969 |
} |
1970 |
if (plot_chain) { |
1971 |
tf_nucs = lapply(wp_nucs, function(nuc) { |
1972 |
bar = apply(t(nuc$nucs), 2, function(tmp_nuc){ |
1973 |
tmp_nuc = tmp_nuc[[1]] |
1974 |
tmp_nuc$inputs = NULL |
1975 |
tmp_nuc$original_reads = NULL |
1976 |
tmp_nuc$wp = nuc$wp |
1977 |
# print(tmp_nuc) |
1978 |
return(tmp_nuc) |
1979 |
}) |
1980 |
return(do.call(rbind, bar)) |
1981 |
}) |
1982 |
tf_nucs = data.frame(do.call(rbind, tf_nucs)) |
1983 |
tmp_x = (unlist(tf_nucs$lower_bound) + unlist(tf_nucs$upper_bound)) / 2 |
1984 |
tmp_y = y_min + (unlist(tf_nucs$track) - 0.5) * delta_y/nb_rank |
1985 |
tmp_y_prev = tmp_y[-length(tmp_y)] |
1986 |
tmp_y_next = tmp_y[-1] |
1987 |
tmp_y_inter = (tmp_y_prev + tmp_y_next) / 2 |
1988 |
tmp_track = unlist(tf_nucs$track) |
1989 |
tmp_track_prev = tmp_track[-length(tmp_track)] |
1990 |
tmp_track_next = tmp_track[-1] |
1991 |
# tmp_track_inter = signif(tmp_track_prev - tmp_track_next) * (abs(tmp_track_prev - tmp_track_next) > 1) * 25 |
1992 |
if (is.null(config$TRACK_LLR_OFFSET)) { |
1993 |
config$TRACK_LLR_OFFSET = 0 |
1994 |
} |
1995 |
tmp_track_inter = signif(tmp_track_prev - tmp_track_next) + config$TRACK_LLR_OFFSET * 25 |
1996 |
tmp_x_prev = tmp_x[-length(tmp_x)] |
1997 |
tmp_x_next = tmp_x[-1] |
1998 |
need_shift = apply(t(tmp_x_next - tmp_x_prev), 2, function(delta){ delta < 50}) |
1999 |
tmp_x_inter = (tmp_x_prev + tmp_x_next) / 2 + tmp_track_inter * need_shift |
2000 |
tmp_llr_inter =signif(unlist(tf_nucs$llr_score)[-1], 2) |
2001 |
new_tmp_x = c() |
2002 |
new_tmp_y = c() |
2003 |
index_odd = 1:length(tmp_x) * 2 - 1 |
2004 |
index_even = (1:(length(tmp_x) - 1)) * 2 |
2005 |
new_tmp_x[index_odd] = tmp_x |
2006 |
new_tmp_y[index_odd] = tmp_y |
2007 |
new_tmp_x[index_even] = tmp_x_inter |
2008 |
new_tmp_y[index_even] = tmp_y_inter |
2009 |
lines(new_tmp_x , new_tmp_y, lwd=2) |
2010 |
points(tmp_x, tmp_y, cex=4, pch=16, col="white") |
2011 |
points(tmp_x, tmp_y, cex=4, lwd=2) |
2012 |
text(tmp_x, tmp_y, 1:nrow(tf_nucs)) |
2013 |
if (is.null(config$LEGEND_LLR_POS)) { |
2014 |
pos = 2 |
2015 |
} else { |
2016 |
pos = config$LEGEND_LLR_POS |
2017 |
} |
2018 |
col_llr = sapply(tmp_llr_inter, function(llr){if (llr < 20 ) return("green") else return("red")}) |
2019 |
text(tmp_x_inter, tmp_y_inter, tmp_llr_inter, cex=1.5, pos=pos, col=col_llr) |
2020 |
|
2021 |
} |
2022 |
|
2023 |
if (plot_wp_nucs | plot_fuzzy_nucs | plot_common_nucs ) { |
2024 |
replicates_wp_nucs[[replicate_rank]] = wp_nucs |
2025 |
strain = samples[[1]]$strain |
2026 |
wp_maps[[strain]] = flat_aggregated_intra_strain_nucs(wp_nucs, "foo", nb_tracks) |
2027 |
fuzzy_maps[[strain]] = get_intra_strain_fuzzy(wp_maps[[strain]], as.list(samples[[1]]$roi), samples[[1]]$strain, config=config) |
2028 |
|
2029 |
if (plot_fuzzy_nucs) { |
2030 |
fuzzy_map = fuzzy_maps[[strain]] |
2031 |
if (!is.null(fuzzy_map)) { |
2032 |
if (nrow(fuzzy_map) > 0) { |
2033 |
rect(sign(x_min) * fuzzy_map$lower_bound + shift, y_min, sign(x_min) * fuzzy_map$upper_bound + shift, y_max, col=adjustcolor(3, alpha.f = 0.1), border=1) |
2034 |
} |
2035 |
} |
2036 |
} |
2037 |
|
2038 |
if (plot_wp_nucs) { |
2039 |
for (wp_nuc in wp_nucs) { |
2040 |
if (wp_nuc$wp){ |
2041 |
rect(sign(x_min) * wp_nuc$lower_bound + shift, y_min, sign(x_min) * wp_nuc$upper_bound + shift, y_max, col=adjustcolor(2, alpha.f = 0.1), border=1) |
2042 |
if (plot_wp_nuc_model) { |
2043 |
all_original_reads = c() |
2044 |
for(initial_nuc in wp_nuc$nucs) { |
2045 |
all_original_reads = c(all_original_reads, initial_nuc$original_reads) |
2046 |
} |
2047 |
delta_x = wp_nuc$lower_bound:wp_nuc$upper_bound |
2048 |
lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(all_original_reads), sd(all_original_reads)) * length(all_original_reads) * height/5 + y_min, col=1) |
2049 |
} |
2050 |
} |
2051 |
} |
2052 |
} |
2053 |
} |
2054 |
} |
2055 |
} |
2056 |
|
2057 |
if (plot_common_nucs) { |
2058 |
if (is.null(aligned_inter_strain_nucs)) { |
2059 |
aligned_inter_strain_nucs = align_inter_strain_nucs(replicates, replicates_wp_nucs[[1]], replicates_wp_nucs[[2]], config=config)[[1]] |
2060 |
if (!is.null(aligned_inter_strain_nucs)) { |
2061 |
aligned_inter_strain_nucs$cur_index = "foo" |
2062 |
} |
2063 |
} |
2064 |
|
2065 |
#Plot common wp nucs |
2066 |
mid_y = shift = x_min = x_max = nb_rank = base = ylim = ymin = y_max = delta_y = list() |
2067 |
for (replicate_rank in 1:length(replicates)) { |
2068 |
nb_rank[[replicate_rank]] = length(samples) |
2069 |
base[[replicate_rank]] = (replicate_rank-1) * height * nb_rank[[replicate_rank]] |
2070 |
ylim[[replicate_rank]] = c(base[[replicate_rank]], base[[replicate_rank]] + height * nb_rank[[replicate_rank]]) |
2071 |
y_min[[replicate_rank]] = min(ylim[[replicate_rank]]) |
2072 |
y_max[[replicate_rank]] = max(ylim[[replicate_rank]]) |
2073 |
delta_y[[replicate_rank]] = y_max[[replicate_rank]] - y_min[[replicate_rank]] |
2074 |
mid_y[[replicate_rank]] = (y_max[[replicate_rank]] + y_min[[replicate_rank]]) / 2 |
2075 |
|
2076 |
samples = replicates[[replicate_rank]] |
2077 |
for (sample_rank in 1:length(samples)) { |
2078 |
sample = samples[[sample_rank]] |
2079 |
y_lev = y_min[[replicate_rank]] + (sample_rank - 0.5) * delta_y[[replicate_rank]]/nb_rank[[replicate_rank]] |
2080 |
if (sample$roi[["begin"]] < sample$roi[["end"]]) { |
2081 |
x_min[[replicate_rank]] = sample$roi[["begin"]] |
2082 |
x_max[[replicate_rank]] = sample$roi[["end"]] |
2083 |
} else { |
2084 |
x_min[[replicate_rank]] = - sample$roi[["begin"]] |
2085 |
x_max[[replicate_rank]] = - sample$roi[["end"]] |
2086 |
} |
2087 |
shift[[replicate_rank]] = x_min[[1]] - x_min[[replicate_rank]] |
2088 |
} |
2089 |
} |
2090 |
print(aligned_inter_strain_nucs) |
2091 |
if (!is.null(aligned_inter_strain_nucs)) { |
2092 |
for (inter_strain_nuc_index in 1:length(aligned_inter_strain_nucs[,1])) { |
2093 |
inter_strain_nuc = aligned_inter_strain_nucs[inter_strain_nuc_index,] |
2094 |
tmp_xs = tmp_ys = c() |
2095 |
for (replicate_rank in 1:length(replicates)) { |
2096 |
samples = replicates[[replicate_rank]] |
2097 |
strain = samples[[1]]$strain |
2098 |
tmp_xs = c(tmp_xs, sign(x_min[[replicate_rank]]) * (inter_strain_nuc[[paste("lower_bound_",strain,sep="")]] + inter_strain_nuc[[paste("upper_bound_",strain,sep="")]])/2 + shift[[replicate_rank]]) |
2099 |
tmp_ys = c(tmp_ys, mid_y[[replicate_rank]]) |
2100 |
} |
2101 |
lines(tmp_xs, tmp_ys, col=2, type="b", lwd=dev.size()[1]*100/(x_max[[1]]-x_min[[1]])*8, cex=dev.size()[1]*400/(x_max[[1]]-x_min[[1]]), pch=19) |
2102 |
} |
2103 |
} |
2104 |
|
2105 |
if (plot_common_unrs) { |
2106 |
combi = c(replicates[[1]][[1]]$strain, replicates[[2]][[1]]$strain) |
2107 |
roi = as.list(samples[[1]]$roi) |
2108 |
cur_index = "foo" |
2109 |
common_nuc_results = list() |
2110 |
common_nuc_results[[paste(combi[1], combi[2], sep="_")]] = aligned_inter_strain_nucs |
2111 |
unrs = get_unrs(combi, roi, cur_index, wp_maps, fuzzy_maps, common_nuc_results, config = config) |
2112 |
rect(sign(x_min[[1]]) * unrs$lower_bound + shift[[1]], y_min[[1]], sign(x_min[[1]]) * unrs$upper_bound + shift[[1]], y_max[[2]], border=4, lwd=10, col=adjustcolor(4, alpha.f = 0.05)) |
2113 |
} |
2114 |
|
2115 |
} |
2116 |
return(returned_list) |
2117 |
} |
2118 |
|
2119 |
|
2120 |
|
2121 |
get_intra_strain_fuzzy = function(# Compute the fuzzy list for a given strain. |
2122 |
### This function grabs the nucleosomes detxted by template_filter that have been rejected bt aggregate_intra_strain_nucs as well positions. |
2123 |
wp_map, ##<< Well positionned nucleosomes map. |
2124 |
roi, ##<< The region of interest. |
2125 |
strain, ##<< The strain we want to extracvt the fuzzy map. |
2126 |
config=NULL ##<< GLOBAL config variable. |
2127 |
) { |
2128 |
fuzzy_map = wp_map[wp_map$wp==0, ] |
2129 |
if (nrow(fuzzy_map) > 0) { |
2130 |
fuzzy_map = substract_region(fuzzy_map, wp_map[wp_map$wp==1,]) |
2131 |
if (!is.null(fuzzy_map)) { |
2132 |
fuzzy_map = union_regions(fuzzy_map) |
2133 |
fuzzy_map = crop_fuzzy(fuzzy_map, roi, strain, config) |
2134 |
} |
2135 |
} |
2136 |
return(fuzzy_map) |
2137 |
} |
2138 |
|
2139 |
|
2140 |
|
2141 |
|
2142 |
|
2143 |
get_unrs = function(# Compute the unaligned nucleosomal regions (UNRs). |
2144 |
### This function aggregate non common wp nucs for each strain and substract common wp nucs. It does not take care about the size of the resulting UNR. It will be take into account in the count read part og the pipeline. |
2145 |
combi, ##<< The strain combination to consider. |
2146 |
roi, ##<< The region of interest. |
2147 |
cur_index, ##<< The region of interest index. |
2148 |
wp_maps, ##<< Well positionned nucleosomes maps. |
2149 |
fuzzy_maps, ##<< Fuzzy nucleosomes maps. |
2150 |
common_nuc_results, ##<< Common wp nuc maps |
2151 |
config=NULL ##<< GLOBAL config variable |
2152 |
) { |
2153 |
# print(cur_index) |
2154 |
|
2155 |
tmp_combi_key = paste(combi[1], combi[2], sep="_") |
2156 |
tmp_common_nucs = common_nuc_results[[tmp_combi_key]] |
2157 |
tmp_common_nucs = tmp_common_nucs[tmp_common_nucs$cur_index == cur_index, ] |
2158 |
|
2159 |
# print(paste("Dealing with unr from", combi[1])) |
2160 |
tmp_fuzzy = fuzzy_maps[[combi[1]]] |
2161 |
tmp_fuzzy = tmp_fuzzy[tmp_fuzzy$cur_index == cur_index, ] |
2162 |
tmp_wp = wp_maps[[combi[1]]] |
2163 |
tmp_wp = tmp_wp[tmp_wp$wp==1,] |
2164 |
tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ] |
2165 |
# Let's go! |
2166 |
tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) { |
2167 |
if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[1], sep="_")]]) { |
2168 |
return(NULL) |
2169 |
} else { |
2170 |
return (index_nuc) |
2171 |
} |
2172 |
})) |
2173 |
tmp_unaligned_wp = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ] |
2174 |
tmp_unr = rbind(tmp_fuzzy,tmp_unaligned_wp[,1:4]) |
2175 |
if (length(tmp_unr) != 0) { |
2176 |
tmp_unr = union_regions(tmp_unr) |
2177 |
} |
2178 |
tmp_unr_nucs_1 = tmp_unr |
2179 |
if (length(tmp_unr_nucs_1[,1]) == 0) {return(NULL)} |
2180 |
agg_unr_1 = tmp_unr_nucs_1 |
2181 |
|
2182 |
# print(paste("Dealing with unr from ", combi[2])) |
2183 |
tmp_fuzzy = fuzzy_maps[[combi[2]]] |
2184 |
tmp_fuzzy = tmp_fuzzy[tmp_fuzzy$cur_index == cur_index, ] |
2185 |
tmp_wp = wp_maps[[combi[2]]] |
2186 |
tmp_wp = tmp_wp[tmp_wp$wp==1,] |
2187 |
tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ] |
2188 |
# Let's go! |
2189 |
tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) { |
2190 |
if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[2], sep="_")]]) { |
2191 |
return(NULL) |
2192 |
} else { |
2193 |
return (index_nuc) |
2194 |
} |
2195 |
})) |
2196 |
tmp_unaligned_wp = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ] |
2197 |
tmp_unr = rbind(tmp_fuzzy,tmp_unaligned_wp[,1:4]) |
2198 |
if (length(tmp_unr) != 0) { |
2199 |
tmp_unr = union_regions(tmp_unr) |
2200 |
} |
2201 |
tmp_unr_nucs_2 = tmp_unr |
2202 |
if (length(tmp_unr_nucs_2[,1]) == 0) {return(NULL)} |
2203 |
agg_unr_2 = crop_fuzzy(tmp_unr_nucs_2, roi, combi[2], config) |
2204 |
tr_agg_unr_2 = translate_regions(agg_unr_2, combi, cur_index, roi=roi, config=config) |
2205 |
tr_agg_unr_2 = union_regions(tr_agg_unr_2) |
2206 |
|
2207 |
# print("Dealing with unr from both...") |
2208 |
all_unr = union_regions(rbind(agg_unr_1, tr_agg_unr_2)) |
2209 |
|
2210 |
# print(paste("Dealing with wp from", combi[1])) |
2211 |
tmp_wp = wp_maps[[combi[1]]] |
2212 |
tmp_wp = tmp_wp[tmp_wp$wp==1,] |
2213 |
tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ] |
2214 |
# Let's go! |
2215 |
tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) { |
2216 |
if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[1], sep="_")]]) { |
2217 |
return (index_nuc) |
2218 |
} else { |
2219 |
return(NULL) |
2220 |
} |
2221 |
})) |
2222 |
wp_nucs_1 = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ] |
2223 |
|
2224 |
# print(paste("Dealing with wp from", combi[2])) |
2225 |
tmp_wp = wp_maps[[combi[2]]] |
2226 |
tmp_wp = tmp_wp[tmp_wp$wp==1,] |
2227 |
tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ] |
2228 |
# Let's go! |
2229 |
tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) { |
2230 |
if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[2], sep="_")]]) { |
2231 |
return (index_nuc) |
2232 |
} else { |
2233 |
return(NULL) |
2234 |
} |
2235 |
})) |
2236 |
wp_nucs_2 = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ] |
2237 |
wp_nucs_2 = crop_fuzzy(wp_nucs_2, roi, combi[2], config) |
2238 |
if (nrow(wp_nucs_2) == 0) { |
2239 |
tr_wp_nucs_2 = wp_nucs_2 |
2240 |
} else { |
2241 |
tr_wp_nucs_2 = translate_regions(wp_nucs_2, combi, cur_index, roi=roi, config=config) |
2242 |
} |
2243 |
|
2244 |
# print("Dealing with wp from both...") |
2245 |
all_wp = union_regions(rbind(wp_nucs_1[,1:4], tr_wp_nucs_2)) |
2246 |
|
2247 |
# print("Dealing with unr and wp...") |
2248 |
non_inter_unr = substract_region(all_unr, all_wp) |
2249 |
non_inter_unr = crop_fuzzy(non_inter_unr, roi, combi[1], config) |
2250 |
if (is.null(non_inter_unr)) { return(NULL) } |
2251 |
non_inter_unr$len = non_inter_unr$upper_bound - non_inter_unr$lower_bound |
2252 |
min_unr_width = 75 |
2253 |
non_inter_unr = non_inter_unr[non_inter_unr$len >= min_unr_width,] |
2254 |
|
2255 |
non_inter_unr$index_nuc = 1:length(non_inter_unr[,1]) |
2256 |
return (non_inter_unr) |
2257 |
} |
2258 |
|
2259 |
|
2260 |
|