root / src / R / nucleominer.R @ d973538c

FDR = structure(function#  False Discovery Rate
2
### From a vector x of independent p-values, extract the cutoff corresponding to the specified FDR. See the Benjamini & Hochberg (1995) paper.
3
##author<< Gael Yvert,
4
(
5
x, ##<< A vector x of independent p-values.
6
FDR ##<< The specified FDR.
7
) {
8
  x <- sort(na.omit(x))
9
  N = length(x)
10
  i = 1;
11
  while(N*x[i]/i < FDR & i <= N) i = i + 1; # we search for the highest i where Nrandom / Nobserved < FDR
12
  if (i == 1)
13
    return (NA)
14
  else
15
    return( x[i-1] )
16
### Returns the corresponding cutoff.
17
}, ex=function(){
18
  # Minimal sketch: mostly uniform (null) p-values plus a few strong signals
  p = c(runif(900), runif(100, 0, 1e-4))
  FDR(p, 0.05)
19
})
llr_score_nvecs = structure(function # Likelihood ratio
24
### Compute the log likelihood ratio of two or more sets of values.
(
  xs ##<< A list of numeric vectors.
27
) {
28
  l = length(xs)
29
  if (l < 1) {
30
    return(NA)
31
  }
32
  if (l == 1) {
33
    return(1)
34
  }
35
  sumllX = 0
36
  for (i in 1:l) {
37
    x = xs[[i]]
38
  	if (length(x) <= 1) {
39
  		return(NA)
40
  	}
41
    meanX = mean(x)
42
    sdX = sd(x)
43
    llX = sum(log(dnorm(x,mean=meanX,sd=sdX)))
44
    sumllX = sumllX + llX
45
  }
46
  meanXglo = mean(unlist(xs))
47
  sdXglo = sd(unlist(xs))
48
  llXYZ = sum(log(dnorm(unlist(xs),mean=meanXglo,sd=sdXglo)))
49
  ratio = sumllX - llXYZ
50
  return(ratio)
51
### Returns the log likelihood ratio.
52
}, ex=function(){
53
  # LLR score for 2 sets of values
  mean1 = 5; sd1 = 2; card1 = 200
  mean2 = 6; sd2 = 3; card2 = 250
56
  x1 = rnorm(card1, mean1, sd1)
57
  x2 = rnorm(card2, mean2, sd2)
58
  min = floor(min(c(x1,x2)))
59
  max = ceiling(max(c(x1,x2)))
60
  hist(c(x1,x2), xlim=c(min, max), breaks=min:max)
61
  lines(min:max,dnorm(min:max,mean1,sd1)*card1,col=2)
62
  lines(min:max,dnorm(min:max,mean2,sd2)*card2,col=3)
63
  lines(min:max, dnorm(min:max, mean(c(x1,x2)), sd(c(x1,x2))) * (card1 + card2), col=4)
64
  llr_score_nvecs(list(x1,x2))
65
 })
66

    
67
dfadd = structure(function# Add a list to a dataframe.
68
### Add a list \emph{l} to a dataframe \emph{df}. Create it if \emph{df} is \emph{NULL}. Return the dataframe \emph{df}.
69
	(df, ##<<  A dataframe
70
		l ##<<  A list
71
	) {
72
  if (is.null(df)) {
73
    df = data.frame(l,stringsAsFactors=FALSE)
74
  } else {
75
    df = rbind(df, data.frame(l,stringsAsFactors=FALSE))
76
  }
77
  return(df)
78
### Return the dataframe \emph{df}.
79
}, ex=function(){
80
		## Here the dataframe is NULL
		df = NULL
		print(df)
83

    
84
		# Initialize df
85
		df = dfadd(df, list(key1 = "value1", key2 = "value2"))
86
		print(df)
87

    
88
		# Adding elements to df
89
		df = dfadd(df, list(key1 = "value1'", key2 = "value2'"))
90
		print(df)
91
})
sign_from_strand = function(
95
### Get the sign of a strand.
96
strands) {
97
	apply(t(strands), 2, function(strand) {	if (strand == "F") return(1) else return(-1)})
98
### If the strand is forward ("F"), returns 1; otherwise returns -1.
99
}
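# Usage sketch (hypothetical values), relying only on the function above:
# sign_from_strand(c("F", "R", "F"))  # 1 -1 1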
100

    
101
flat_reads = function(
102
### Extract read coordinates from TemplateFilter input reads.
103
reads, ##<< TemplateFilter input reads
104
nuc_width ##<< Width used to shift F and R reads.
105
) {
106
	F_flatted_reads = unlist(apply(t(reads[reads$V3=="F",]),2,function(r){rep(as.integer(r[2]), r[4])}))
107
	R_flatted_reads = unlist(apply(t(reads[reads$V3=="R",]),2,function(r){rep(as.integer(r[2]), r[4])}))
108
	flatted_reads = c(F_flatted_reads + rep(nuc_width/2, length(F_flatted_reads)), R_flatted_reads - rep(nuc_width/2, length(R_flatted_reads))  )
109
	return(list(F_flatted_reads, R_flatted_reads, flatted_reads))
110
### Returns a list of F reads, R reads and joint/shifted F and R reads.
111
}
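# Usage sketch, assuming a TemplateFilter input table with columns
# V1 = chromosome, V2 = position, V3 = strand, V4 = read count (hypothetical values):
# reads = data.frame(V1 = "1", V2 = c(100, 180, 250), V3 = c("F", "F", "R"),
#                    V4 = c(2, 1, 3), stringsAsFactors = FALSE)
# flat_reads(reads, nuc_width = 160)  # list(F reads, R reads, shifted F+R reads)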
filter_tf_outputs = function(# Filter TemplateFilter outputs
116
### This function filters TemplateFilter outputs according not only to the properties of the observed genome area, but also to correlation and overlap thresholds.
117
tf_outputs, ##<< TemplateFilter outputs.
118
chr, ##<< Chromosome observed, here chr is an integer.
119
x_min, ##<< Coordinate of the first bp observed.
120
x_max, ##<< Coordinate of the last bp observed.
121
nuc_width = 160, ##<< Nucleosome width.
122
ol_bp = 59, ##<< Overlap Threshold.
123
corr_thres = 0.5 ##<< Correlation threshold.
124
) {
125
  if (x_min < 0) {
126
    tf_outputs = tf_outputs[tf_outputs$chr == paste("chr", chr, sep="") & tf_outputs$center - tf_outputs$width/2 >= -x_max & tf_outputs$center + tf_outputs$width/2 <=  -x_min,]
127
	} else {
128
    tf_outputs = tf_outputs[tf_outputs$chr == paste("chr", chr, sep="") & tf_outputs$center - tf_outputs$width/2 >= x_min & tf_outputs$center + tf_outputs$width/2 <= x_max,]
129
  }
130
  tf_outputs$lower_bound = tf_outputs$center - tf_outputs$width/2
131
  tf_outputs$upper_bound = tf_outputs$center + tf_outputs$width/2
132
  tf_outputs = tf_outputs[tf_outputs$correlation.score >= corr_thres,]
133
  tf_outputs = tf_outputs[order(tf_outputs$correlation.score, decreasing=TRUE),]
134
  i = 1
135
  while (i <= length(tf_outputs[,1])) {
136
    lb = tf_outputs[i,]$lower_bound
137
    ub = tf_outputs[i,]$upper_bound
138
    tf_outputs = tf_outputs[!(tf_outputs$lower_bound <= (ub-ol_bp) & tf_outputs$upper_bound > ub) & !(tf_outputs$upper_bound >= (lb+ol_bp) & tf_outputs$lower_bound < lb),]
139
    i = i+1
140
  }
141
  return(tf_outputs)
142
### Returns the filtered TemplateFilter outputs.
143
}
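# Usage sketch, mirroring the TF output built in the aggregate_intra_strain_nucs()
# example below (hypothetical values):
# tf_nuc  = list(chr = "chr1", center = 1150, width = 150, correlation.score = 0.9)
# outputs = dfadd(NULL, tf_nuc)
# filter_tf_outputs(outputs, chr = 1, x_min = 1000, x_max = 1300)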
filter_tf_inputs = function(# Filter TemplateFilter inputs
148
### This function filters TemplateFilter inputs according to the properties of the observed genome area. It takes into account reads at the boundary of this area and the strand of these reads.
149
inputs, ##<< TF inputs to be filtered.
150
chr, ##<< Chromosome observed, here chr is an integer.
151
x_min, ##<< Coordinate of the first bp observed.
152
x_max, ##<< Coordinate of the last bp observed.
153
nuc_width = 160, ##<< Nucleosome width.
154
only_f = FALSE, ##<< Filter only F reads.
155
only_r = FALSE, ##<< Filter only R reads.
156
filter_for_coverage = FALSE, ##<< If TRUE, filter for plotting coverage (the strand-shift filtering step is skipped).
157
USE_DPLYR = TRUE ##<< Use dplyr lib to filter reads.
158
) {
159
  n = names(inputs)
160

    
161
  if (!USE_DPLYR) {  
162
    if (only_f) {
163
      inputs_out = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "F" & inputs[,2] <= x_max + nuc_width,]
164
    } else if (only_r) {
165
      inputs_out = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "R" & inputs[,2] <= x_max + nuc_width,]
166
    } else {
167
      inputs_out = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,2] <= x_max + nuc_width,]
168
    }
169
  } else {
170
    names(inputs) = c("chr", "pos", "str", "lev")
171
    if (only_f) {
172
      inputs_out = filter(inputs, chr == chr,  pos >= x_min - nuc_width, str == "F", pos <= x_max + nuc_width)
173
    } else if (only_r) {
174
      inputs_out = filter(inputs, chr == chr, pos >= x_min - nuc_width, str == "R" & pos <= x_max + nuc_width)
175
    } else {
176
      inputs_out = filter(inputs, chr == chr, pos >= x_min - nuc_width, pos <= x_max + nuc_width)
177
    }
178
      # if (!filter_for_coverage) {
179
      #   inputs$corrected_inputs_coords = inputs[,2] + nuc_width/2 * sign_from_strand(inputs[,3])
180
      #   inputs = filter(inputs, chr == chr, corrected_inputs_coords >= x_min, corrected_inputs_coords <= x_max)
181
      #   inputs$corrected_inputs_coords = NULL
182
      # }  
183
  }
184

    
185
  if (!filter_for_coverage) {
186
    corrected_inputs_coords = inputs_out[,2] + nuc_width/2 * sign_from_strand(inputs_out[,3])
187
    inputs_out = inputs_out[inputs_out[,1]==chr & corrected_inputs_coords >= x_min & corrected_inputs_coords <= x_max,]
188
  }
189

    
190
  names(inputs_out) = n 
191
	return(inputs_out)
192
### Returns the filtered inputs.
193
}
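# Usage sketch (USE_DPLYR = TRUE assumes the dplyr package is attached; hypothetical values):
# inputs = data.frame(V1 = 1, V2 = c(900, 1100, 1350), V3 = c("F", "R", "F"),
#                     V4 = c(3, 2, 1), stringsAsFactors = FALSE)
# filter_tf_inputs(inputs, chr = 1, x_min = 1000, x_max = 1300, nuc_width = 160)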
get_comp_strand = function(
198
### Compute the complementary strand.
199
strand ##<< The original strand.
200
) {
201
	apply(t(strand),2, function(n){
202
	  if (n=="a") {return("t")}
203
		if (n=="t") {return("a")}
204
		if (n=="c") {return("g")}
205
		if (n=="g") {return("c")}
206
	})
207
### Returns the complementary strand.
208
}
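# Usage sketch:
# get_comp_strand(c("a", "t", "g", "c"))  # "t" "a" "c" "g"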
aggregate_intra_strain_nucs = structure(function(# Aggregate replicated samples' nucleosomes.
### This function aggregates nucleosomes from replicated samples. It uses the TemplateFilter output of each sample as a replicate. Each sample owns a set of nucleosomes computed using TemplateFilter and ordered by the position of their center (dyad). A chain of nucleosomes is built across all replicates. Adjacent nucleosomes of the chain are compared two by two. The comparison is based on a log likelihood ratio (LLR1): depending on the LLR1 value, nucleosomes are merged (low LLR) or separated (high LLR). Finally the function returns a list of clusters and all computed llr_scores. Each cluster owns an attribute wp for "well positioned", which is set to TRUE if the cluster is composed of exactly one nucleosome from each sample.
213
samples, ##<< A list of samples. Each sample is a list like \emph{sample = list(id=..., marker=..., strain=..., roi=..., inputs=..., outputs=...)} with \emph{roi = list(name=..., begin=...,  end=..., chr=..., genome=...)}.
214
llr_thres=20, ##<< Log likelihood ratio threshold to decide between merging and separating
215
coord_max=20000000 ##<< A sentinel coordinate, larger than any possible nucleosome lower bound.
216
){
217
	end_of_tracks = function(tracks) {
218
		if (length(tracks) == 0) {
219
			return(TRUE)
220
		}
221
	  for (lower_bound in tracks) {
222
			if(!is.na(lower_bound)) {
223
	      if (lower_bound < coord_max) {
224
	        return(FALSE)
225
	      }
226
	  	}
227
	  }
228
	  return(TRUE)
229
	}
230
	store_cluster = function(clusters, new_cluster, nb_nucs_in_cluster, nuc_from_track, nb_tracks, min_nuc_center, max_nuc_center) {
231
		if ( nb_nucs_in_cluster==nb_tracks & sum(nuc_from_track)==nb_tracks) {
232
			new_cluster$wp = TRUE
233
			center = (new_cluster$lower_bound + new_cluster$upper_bound) / 2
234
			if (is.null(min_nuc_center) | ((min_nuc_center <= center) & (center < max_nuc_center))) {
235
		  	clusters[[length(clusters) + 1]] = new_cluster
236
				# print(new_cluster)
237
		  }
238
		} else {
239
			new_cluster$wp = FALSE
240
			center = (new_cluster$lower_bound + new_cluster$upper_bound) / 2
241
			if (is.null(min_nuc_center) | ((min_nuc_center <= center) & (center < max_nuc_center))) {
242
			  clusters[[length(clusters) + 1]] = new_cluster
243
			}
244
		}
245
		return(clusters)
246
	}
247
	strain = samples[[1]]$strain
248
	llr_scores = c()
249
  min_nuc_center = min(samples[[1]]$roi$begin, samples[[1]]$roi$end)
250
	max_nuc_center = max(samples[[1]]$roi$begin, samples[[1]]$roi$end)
251
  # compute clusters
252
  clusters = list()
253
  cluster_contents = list()
254
  # Init reader
255
  indexes = c()
256
  track_readers = c()
257
  current_nuc = NULL
258
	llr_score = llr_thres + 1
259
  # Read nucs from TF outputs
260
  tf_outs = list()
261
	i = 1
262
  for (sample in samples) {
263
		tf_outs[[i]] = sample$outputs
264
		tf_outs[[i]] = tf_outs[[i]][order(tf_outs[[i]]$center),]
265
    indexes[i] = 1
266
		if (is.na(tf_outs[[i]][indexes[i],]$center)) {
267
      track_readers[i] = coord_max
268
	  } else {
269
      track_readers[i] = tf_outs[[i]][indexes[i],]$center
270
		}
271
		i = i+1
272
  }
273
  nb_tracks = length(tf_outs)
274
	# print(track_readers)
275
  new_cluster = NULL
276
  nb_nucs_in_cluster = 0
277
  nuc_from_track = c()
278
  for (i in 1:nb_tracks){
279
    nuc_from_track[i] = FALSE
280
  }
281
  # Start clustering
282
  while (!end_of_tracks(track_readers)) {
283
    new_center = min(track_readers)
284
		current_track = which(track_readers == new_center)[1]
285
    new_nuc = as.list(tf_outs[[current_track]][indexes[current_track],])
286
		new_nuc$chr = substr(new_nuc$chr,4,1000000L)
287
		new_nuc$inputs = samples[[current_track]]$inputs
288
		new_nuc$chr = samples[[current_track]]$roi$chr
289
		new_nuc$track = current_track
290

    
291
		new_nuc$inputs = filter_tf_inputs(samples[[current_track]]$inputs, new_nuc$chr, new_nuc$lower_bound, new_nuc$upper_bound, new_nuc$width)
292
		flatted_reads = flat_reads(new_nuc$inputs, new_nuc$width)
293
		new_nuc$original_reads = flatted_reads[[3]]
294

    
295
    new_upper_bound = new_nuc$upper_bound
296

    
297
    if (!is.null(current_nuc)) {
298
			llr_score = llr_score_nvecs(list(current_nuc$original_reads,new_nuc$original_reads))
299
			llr_scores = c(llr_scores,llr_score)
300
		}
301
		# print(paste(llr_score, length(current_nuc$original_reads), length(new_nuc$original_reads), sep=" "))
302
		if (is.na(llr_score)) {
303
			llr_score = llr_thres + 1
304
		}
305
		# Store llr_score
306
		new_nuc$llr_score = llr_score
307
	  if (llr_score < llr_thres) {
308
      # aggregate to current cluster
309
      #   update bound
310
      if (new_nuc$upper_bound > new_cluster$upper_bound) {
311
        new_cluster$upper_bound = new_nuc$upper_bound
312
      }
313
      if (new_nuc$lower_bound < new_cluster$lower_bound) {
314
        new_cluster$lower_bound = new_nuc$lower_bound
315
      }
316
      #   add nucleosome to current cluster
317
      nuc_from_track[current_track] = TRUE
318
      nb_nucs_in_cluster = nb_nucs_in_cluster + 1
319
			new_cluster$nucs[[length(new_cluster$nucs)+1]] = new_nuc
320
    } else {
321
			if (!is.null(new_cluster)) {
322
        # store old cluster
323
	      clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster, nuc_from_track, nb_tracks, min_nuc_center, max_nuc_center)
324
			}
325
      # Reinit current cluster composition stuff
326
      nb_nucs_in_cluster = 0
327
      nuc_from_track = c()
328
      for (i in 1:nb_tracks){
329
        nuc_from_track[i] = FALSE
330
      }
331
      # create new cluster
332
      new_cluster = list(lower_bound=new_nuc$low, upper_bound=new_nuc$up, chr=new_nuc$chr, strain_ref=strain , nucs=list())
333
      # update upper bound
334
      current_upper_bound = new_upper_bound
335
      # add nucleosome to current cluster
336
      nb_nucs_in_cluster = nb_nucs_in_cluster + 1
337
      nuc_from_track[current_track] = TRUE
338
			new_cluster$nucs[[length(new_cluster$nucs)+1]] = new_nuc
339

    
340
		}
341

    
342
		current_nuc = new_nuc
343

    
344
    # update indexes
345
    if (indexes[current_track] < length(tf_outs[[current_track]]$center)) {
346
      indexes[current_track] = indexes[current_track] + 1
347
      # update track
348
      track_readers[current_track] = tf_outs[[current_track]][indexes[current_track],]$center
349
    } else {
350
      # update track
351
      track_readers[current_track] = coord_max
352
    }
353
  }
354
  # store last cluster
355
  if (!is.null(new_cluster)) {
356
    # store old cluster
357
    clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster, nuc_from_track, nb_tracks, min_nuc_center, max_nuc_center)
358
  }
359
	return(list(clusters, llr_scores))
360
### Returns a list of nucleosome clusters, and all computed llr scores.
361
}, ex=function(){
362
	# Dealing with a region of interest
363
	roi =list(name="example", begin=1000,  end=1300, chr="1", genome=rep("A",301))
364
	samples = list()
365
	for (i in 1:3) {
366
		# Create TF output
367
		tf_nuc = list("chr"=paste("chr", roi$chr, sep=""), "center"=(roi$end + roi$begin)/2, "width"= 150, "correlation.score"= 0.9)
368
		outputs = dfadd(NULL,tf_nuc)
369
		outputs = filter_tf_outputs(outputs, roi$chr, roi$begin, roi$end)
370
		# Generate corresponding reads
371
		nb_reads = round(runif(1,170,230))
372
		reads = round(rnorm(nb_reads, tf_nuc$center,20))
373
		u_reads = sort(unique(reads))
374
		strands = sample(c(rep("R",ceiling(length(u_reads)/2)),rep("F",floor(length(u_reads)/2))))
375
		counts = apply(t(u_reads), 2, function(r) { sum(reads == r)})
376
		shifts = apply(t(strands), 2, function(s) { if (s == "F") return(-tf_nuc$width/2) else return(tf_nuc$width/2)})
377
		u_reads = u_reads + shifts
378
		inputs = data.frame(list("V1" = rep(roi$chr, length(u_reads)),
379
		                         "V2" = u_reads,
380
														 "V3" = strands,
381
														 "V4" = counts), stringsAsFactors=FALSE)
382
		samples[[length(samples) + 1]] = list(id=1, marker="Mnase_Seq", strain="strain_ex", total_reads = 10000000, roi=roi, inputs=inputs, outputs=outputs)
383
	}
384
	print(aggregate_intra_strain_nucs(samples))
385
})
386

    
387
flat_aggregated_intra_strain_nucs = function(# Flatten aggregate_intra_strain_nucs function output
### This function builds a dataframe of all clusters obtained from the aggregate_intra_strain_nucs function.
389
partial_strain_maps, ##<< the output of aggregate_intra_strain_nucs function
390
cur_index, ##<< the index of the roi involved
391
nb_tracks=3 ##<< the number of replicates
392
) {
393
	if  (length(partial_strain_maps) == 0 ){
394
		print(paste("Empty partial_strain_maps for roi", cur_index, "ands current strain." ))
395
    tmp_strain_maps = list()
396
	} else {
397
		tmp_strain_map = apply(t(1:length(partial_strain_maps)), 2, function(i){
398
			tmp_nuc = partial_strain_maps[[i]]
399
			tmp_nuc_as_list = list()
400
			tmp_nuc_as_list[["chr"]] = tmp_nuc[["chr"]]
401
			tmp_nuc_as_list[["lower_bound"]] = ceiling(tmp_nuc[["lower_bound"]])
402
			tmp_nuc_as_list[["upper_bound"]] = floor(tmp_nuc[["upper_bound"]])
403
			tmp_nuc_as_list[["cur_index"]] = cur_index
404
			tmp_nuc_as_list[["index_nuc"]] = i
405
			tmp_nuc_as_list[["wp"]] = as.integer(tmp_nuc$wp)
406
			all_original_reads = c()
407
			for (j in 1:length(tmp_nuc$nucs)) {
408
				all_original_reads = c(all_original_reads, tmp_nuc$nucs[[j]]$original_reads)
409
			}
410
			tmp_nuc_as_list[["nb_reads"]] = length(all_original_reads)
411
			tmp_nuc_as_list[["nb_nucs"]] = length(tmp_nuc$nucs)
412
			if (tmp_nuc$wp) {
413
        for (i in 1:(nb_tracks-1)) {
414
  				tmp_nuc_as_list[[paste("llr", i, sep="_")]] = signif(tmp_nuc$nucs[[i + 1]]$llr_score,5)          
415
        }
416
			} else {
417
        for (i in 1:(nb_tracks-1)) {
418
  				tmp_nuc_as_list[[paste("llr", i, sep="_")]] = NA
419
        }
420
			}
421
      return(tmp_nuc_as_list)
422
    })
423
    tmp_strain_maps = do.call("rbind", tmp_strain_map)
424
	}
425
  return(data.frame(lapply(data.frame(tmp_strain_maps, stringsAsFactors=FALSE), unlist), stringsAsFactors=FALSE))
426
### Returns a dataframe of all clusters obtained from the aggregate_intra_strain_nucs function.
427
}
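# Usage sketch, assuming 'samples' built as in the aggregate_intra_strain_nucs() example:
# partial_strain_maps = aggregate_intra_strain_nucs(samples)[[1]]
# flat_aggregated_intra_strain_nucs(partial_strain_maps, cur_index = 1, nb_tracks = 3)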
428

    
429
align_inter_strain_nucs = structure(function(# Aligns nucleosomes between 2 strains.
430
### This function aligns nucleosomes between two strains for a given genome region.
431
replicates, ##<< Set of replicates, ideally 3 per strain.
432
wp_nucs_strain_ref1=NULL, ##<< List of aggregated nucleosomes for strain 1. If NULL, this list will be computed.
wp_nucs_strain_ref2=NULL, ##<< List of aggregated nucleosomes for strain 2. If NULL, this list will be computed.
434
corr_thres=0.5, ##<< Correlation threshold.
435
llr_thres=100, ##<< Log likelihood ratio threshold to decide between merging and separating
436
config=NULL, ##<< GLOBAL config variable
437
... ##<< A list of parameters that will be passed to \emph{aggregate_intra_strain_nucs} if needed.
438
) {
439
	if (length(replicates) < 2) {
440
		stop("ERROR, align_inter_strain_nucs needs 2 replicate sets.")
441
	} else if (length(replicates) > 2) {
442
		print("WARNING, align_inter_strain_nucs will use 2 first sets of replicates as inputs.")
443
	}
444
	common_nuc = NULL
445
	llr_scores = c()
446
	chr = replicates[[1]][[1]]$roi$chr
447
  min_nuc_center = min(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end)
448
	max_nuc_center = max(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end)
449
	strain_ref1 = replicates[[1]][[1]]$strain
450
	strain_ref2 = replicates[[2]][[1]]$strain
451
	big_cur = replicates[[1]][[1]]$roi
452
  orig_big_cur = replicates[[1]][[1]]$orig_roi
453
	if(big_cur$end - big_cur$begin < 0) {
454
		tmp_begin = big_cur$begin
455
		big_cur$begin =  big_cur$end
456
		big_cur$end =  tmp_begin
457
	}
458
	# GO!
459
	if (is.null(wp_nucs_strain_ref1)) {
460
		wp_nucs_strain_ref1 = aggregate_intra_strain_nucs(replicates[[1]], ...)[[1]]
461
	}
462
	if (is.null(wp_nucs_strain_ref2)) {
463
	  wp_nucs_strain_ref2 = aggregate_intra_strain_nucs(replicates[[2]], ...)[[1]]
464
  }
465
	lws = c()
466
	ups = c()
467
	for (na in wp_nucs_strain_ref2) {
468
		lws = c(lws, na$lower_bound)
469
		ups = c(ups, na$upper_bound)
470
	}
471

    
472
	print(paste("Exploring chr" , chr , ", " , length(wp_nucs_strain_ref1) , ", [" , min_nuc_center , ", " , max_nuc_center , "] nucs...", sep=""))
473
	roi_strain_ref1 = NULL
474
	roi_strain_ref2 = NULL
475
	if (length(wp_nucs_strain_ref1) > 0) {
476
		for(index_nuc_strain_ref1 in 1:length(wp_nucs_strain_ref1)){
477
			# print(paste("" , index_nuc_strain_ref1 , "/" , length(wp_nucs_strain_ref1), sep=""))
478
			nuc_strain_ref1 = wp_nucs_strain_ref1[[index_nuc_strain_ref1]]
479
			# Filtering on Well Positioned
480
			if (nuc_strain_ref1$wp) {
481
				roi_strain_ref1 = list(name=paste("strain_chr_id_" , strain_ref1 , "_" , chr , "_" , "i" , "_", sep=""), begin=nuc_strain_ref1$lower_bound, end=nuc_strain_ref1$upper_bound, chr=chr, strain_ref = strain_ref1)
482
				roi_strain_ref2 = translate_cur(roi_strain_ref1, strain_ref2, big_cur=orig_big_cur, config=config)
483
        if (!is.null(roi_strain_ref2)){
484
					# LOADING INTRA_STRAIN_NUCS_FILENAME_STRAIN_REF2 FILE(S) TO COMPUTE MATCHING_NAS (FILTER)
485
					lower_bound_roi_strain_ref2 = min(roi_strain_ref2$end,roi_strain_ref2$begin)
486
					upper_bound_roi_strain_ref2 = max(roi_strain_ref2$end,roi_strain_ref2$begin)
487
					matching_nas = which( lower_bound_roi_strain_ref2 <= ups & lws <= upper_bound_roi_strain_ref2)
488
					for (index_nuc_strain_ref2 in matching_nas) {
489
						nuc_strain_ref2 = wp_nucs_strain_ref2[[index_nuc_strain_ref2]]
490
						# Filtering on Well Positioned
491
    				nuc_strain_ref2_to_roi = list(begin=nuc_strain_ref2$lower_bound, end=nuc_strain_ref2$upper_bound, chr=nuc_strain_ref2$chr, strain_ref = strain_ref2)
492
						if (!is.null(translate_cur(nuc_strain_ref2_to_roi, strain_ref1, big_cur=orig_big_cur, config=config)) &
493
                nuc_strain_ref2$wp) {
494
							# Filtering on correlation Score and collecting reads
495
							SKIP = FALSE
496
							# TODO: This for loop could be done before working on strain_ref2. Isn't it?
497
							reads_strain_ref1 = c()
498
							for (nuc in nuc_strain_ref1$nucs){
499
								reads_strain_ref1 = c(reads_strain_ref1, nuc$original_reads)
500
								if (nuc$corr < corr_thres) {
501
									SKIP = TRUE
502
								}
503
							}
504
							reads_strain_ref2 = c()
505
							for (nuc in nuc_strain_ref2$nucs){
506
								reads_strain_ref2 = c(reads_strain_ref2, nuc$original_reads)
507
								if (nuc$corr < corr_thres) {
508
									SKIP = TRUE
509
								}
510
							}
511
							# Filtering on correlation Score
512
							if (!SKIP) {
513
								# translation of reads into strain 2 coords
514
								diff = ((roi_strain_ref1$begin + roi_strain_ref1$end) - (roi_strain_ref2$begin + roi_strain_ref2$end)) / 2
515
								reads_strain_ref1 = reads_strain_ref1 - rep(diff, length(reads_strain_ref1))
516
								llr_score = llr_score_nvecs(list(reads_strain_ref1, reads_strain_ref2))
517
								llr_scores = c(llr_scores, llr_score)
518
								# Filtering on LLR Score
519
                if (llr_score < llr_thres) {
520
									tmp_nuc = list()
521
									# strain_ref1
522
									tmp_nuc[[paste("chr_", strain_ref1, sep="")]] = chr
523
									tmp_nuc[[paste("lower_bound_", strain_ref1, sep="")]] = nuc_strain_ref1$lower_bound
524
									tmp_nuc[[paste("upper_bound_", strain_ref1, sep="")]] = nuc_strain_ref1$upper_bound
525
									tmp_nuc[[paste("mean_", strain_ref1, sep="")]] = signif(mean(reads_strain_ref1),5)
526
									tmp_nuc[[paste("sd_", strain_ref1, sep="")]] = signif(sd(reads_strain_ref1),5)
527
									tmp_nuc[[paste("nb_reads_", strain_ref1, sep="")]] = length(reads_strain_ref1)
528
									tmp_nuc[[paste("index_nuc_", strain_ref1, sep="")]] = index_nuc_strain_ref1
529
									# strain_ref2
530
									tmp_nuc[[paste("chr_", strain_ref2, sep="")]] = roi_strain_ref2$chr
531
									tmp_nuc[[paste("lower_bound_", strain_ref2, sep="")]] = nuc_strain_ref2$lower_bound
532
									tmp_nuc[[paste("upper_bound_", strain_ref2, sep="")]] = nuc_strain_ref2$upper_bound
533
									tmp_nuc[[paste("means_", strain_ref2, sep="")]] = signif(mean(reads_strain_ref2),5)
534
									tmp_nuc[[paste("sd_", strain_ref2, sep="")]] = signif(sd(reads_strain_ref2),5)
535
									tmp_nuc[[paste("nb_reads_", strain_ref2, sep="")]] = length(reads_strain_ref2)
536
									tmp_nuc[[paste("index_nuc_", strain_ref2, sep="")]] = index_nuc_strain_ref2
537
									# common
538
									tmp_nuc[["llr_score"]] = signif(llr_score,5)
539
                  tmp_nuc[["common_wp_pval"]] = signif(1-pchisq(2*tmp_nuc[["llr_score"]], df=2),5)
540
									tmp_nuc[["diff"]] = abs(abs((roi_strain_ref2$begin - roi_strain_ref2$end)) / 2 - abs((nuc_strain_ref2$lower_bound - nuc_strain_ref2$upper_bound)) / 2)
541
									common_nuc = dfadd(common_nuc, tmp_nuc)
542
                }
543
							}
544
						}
545
					}
546
				} else {
547
		      print("WARNING! No roi for strain ref 2.")
548
			  }
549
		  }
550
		}
551

    
552
    if(length(unique(common_nuc[,1:3])[,1]) != length((common_nuc[,1:3])[,1])) {
553
      index_redundant = which(apply(common_nuc[,1:3][-length(common_nuc[,1]),] ==  common_nuc[,1:3][-1,] ,1,sum) == 3)
554
      to_remove_list = c()
555
      for (i in 1:length(index_redundant)) {
556
        if (common_nuc[index_redundant[i],15] < common_nuc[index_redundant[i]+1,15]) {
557
          to_remove = index_redundant[i]
558
        }   else {
559
          to_remove = index_redundant[i] + 1
560
        }
561
        to_remove_list = c(to_remove_list, to_remove)
562
      }
563
      common_nuc = common_nuc[-to_remove_list,]
564
    }
565

    
566
    if(length(unique(common_nuc[,8:10])[,1]) != length((common_nuc[,8:10])[,1])) {
567
      index_redundant = which(apply(common_nuc[,8:10][-length(common_nuc[,1]),] == common_nuc[,8:10][-1,] ,1,sum) == 3)
568
      to_remove_list = c()
569
      for (i in 1:length(index_redundant)) {
570
        if (common_nuc[index_redundant[i],15] < common_nuc[index_redundant[i]+1,15]) {
571
          to_remove = index_redundant[i]
572
        }   else {
573
          to_remove = index_redundant[i] + 1
574
        }
575
        to_remove_list = c(to_remove_list, to_remove)
576
      }
577
      common_nuc = common_nuc[-to_remove_list,]
578
    }
579

    
580
		return(list(common_nuc, llr_scores))
581
	} else {
582
		print("WARNING, no nucs for strain_ref1.")
583
		return(NULL)
584
	}
585
### Returns a list containing the dataframe of aligned nucleosomes and all computed llr scores.
586
}, ex=function(){
587

    
588
    # Define new translate_cur function...
589
    translate_cur = function(roi, strain2, big_cur=NULL, config=NULL) {
590
      return(roi)
591
    }
592
    # Bind it by uncommenting the following lines.
593
    unlockBinding("translate_cur", as.environment("package:nucleominer"))
594
    unlockBinding("translate_cur", getNamespace("nucleominer"))
595
    assign("translate_cur", translate_cur, "package:nucleominer")
596
    assign("translate_cur", translate_cur, getNamespace("nucleominer"))
597
    lockBinding("translate_cur", getNamespace("nucleominer"))
598
    lockBinding("translate_cur", as.environment("package:nucleominer"))
599

    
600
	# Dealing with a region of interest
601
	roi =list(name="example", begin=1000,  end=1300, chr="1", genome=rep("A",301), strain_ref1 = "STRAINREF1")
602
	roi2 = translate_cur(roi, roi$strain_ref1)
603
	replicates = list()
604
	for (j in 1:2) {
605
		samples = list()
606
		for (i in 1:3) {
607
			# Create TF output
608
			tf_nuc = list("chr"=paste("chr", roi$chr, sep=""), "center"=(roi$end + roi$begin)/2, "width"= 150, "correlation.score"= 0.9)
609
			outputs = dfadd(NULL,tf_nuc)
610
			outputs = filter_tf_outputs(outputs, roi$chr, roi$begin, roi$end)
611
			# Generate corresponding reads
612
			nb_reads = round(runif(1,170,230))
613
			reads = round(rnorm(nb_reads, tf_nuc$center,20))
614
			u_reads = sort(unique(reads))
615
			strands = sample(c(rep("R",ceiling(length(u_reads)/2)),rep("F",floor(length(u_reads)/2))))
616
			counts = apply(t(u_reads), 2, function(r) { sum(reads == r)})
617
			shifts = apply(t(strands), 2, function(s) { if (s == "F") return(-tf_nuc$width/2) else return(tf_nuc$width/2)})
618
			u_reads = u_reads + shifts
619
			inputs = data.frame(list("V1" = rep(roi$chr, length(u_reads)),
620
			                         "V2" = u_reads,
621
															 "V3" = strands,
622
															 "V4" = counts), stringsAsFactors=FALSE)
623
			samples[[length(samples) + 1]] = list(id=1, marker="Mnase_Seq", strain=paste("strain_ex",j,sep=""), total_reads = 10000000, roi=roi, inputs=inputs, outputs=outputs)
624
		}
625
		replicates[[length(replicates) + 1]] = samples
626
	}
627
	print(align_inter_strain_nucs(replicates))
628
})
fetch_mnase_replicates = function(# Prefetch data
656
### Fetch and filter inputs and outputs per region of interest. Organize them per replicate.
strain, ##<< The strain for which we want MNase-Seq replicates.
658
roi, ##<< Region of interest.
659
all_samples, ##<< Global list of samples.
660
config=NULL, ##<< GLOBAL config variable
661
only_fetch=FALSE, ##<< If TRUE, only fetch without filtering. It is used to load sample files into memory before forking.
662
get_genome=FALSE, ##<< If TRUE, load corresponding genome sequence.
663
get_ouputs=TRUE ##<< If TRUE, also get the corresponding TF output files.
664
) {
665
	samples=list()
666
  samples_ids = unique(all_samples[all_samples$marker == "Mnase_Seq" & all_samples$strain == strain,]$id)
667
	for (i in samples_ids) {
668
		sample = as.list(all_samples[all_samples$id==i,])
669
    sample$orig_roi = roi
670
    sample$roi = translate_cur(roi, sample$strain, config = config)
671
		if (get_genome) {
672
			# Get Genome
673
      sample$roi$genome = get_content(config$FASTA_REFERENCE_GENOME_FILES[[sample$strain]], "fasta")[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end]
674
		}
675
		# Get inputs
676
		sample$inputs = get_content(paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep=""), "table", stringsAsFactors=FALSE)
677
		sample$total_reads = sum(sample$inputs[,4])
678
		if (!only_fetch) {
679
		  sample$inputs = filter_tf_inputs(sample$inputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
680
	  }
681
	  # Get TF outputs for Mnase_Seq samples
682
		if (sample$marker == "Mnase_Seq" & get_ouputs) {
683
			sample$outputs = get_content(paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep=""), "table", header=TRUE, sep="\t")
684
			if (!only_fetch) {
685
	  		sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr,  min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
686
  		}
687
		}
688
		samples[[length(samples) + 1]] = sample
689
	}
690
  return(samples)
691
}
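# Usage sketch (requires the GLOBAL config paths and the per-sample TF files on disk;
# 'all_samples' is the global sample table):
# roi = list(name = "example", begin = 1000, end = 1300, chr = "1", strain_ref = "BY")
# replicates_BY = fetch_mnase_replicates("BY", roi, all_samples, config = config)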
692

    
693
substract_region = function(# Subtract from a list of regions another list of regions that intersects it.
### This function embeds a recursive part, used when a subtracted region splits an original region in two.
695
region1, ##<< Original regions.
696
region2 ##<< Regions to substract.
697
) {
698
  rec_substract_region = function(region1, region2) {
699
  non_inter_fuzzy = apply(t(1:length(region1[,1])), 2, function(i) {
700
    cur_fuzzy = region1[i,]
701
    inter_wp = region2[region2$lower_bound <= cur_fuzzy$upper_bound & region2$upper_bound >= cur_fuzzy$lower_bound,]
702
    if (length(inter_wp[,1]) > 0) {
703
      ret = c()
704
      for (j in 1:length(inter_wp[,1])) {
705
        cur_wp = inter_wp[j,]
706
        if (cur_wp$lower_bound <= cur_fuzzy$lower_bound & cur_fuzzy$upper_bound <= cur_wp$upper_bound) {
707
          # remove cur_fuzzy
708
          ret = c()
709
          break
710
        } else if (cur_wp$lower_bound <= cur_fuzzy$lower_bound & cur_wp$upper_bound < cur_fuzzy$upper_bound) {
711
          # crop fuzzy
712
          cur_fuzzy$lower_bound = cur_wp$upper_bound + 1
713
          ret = cur_fuzzy
714
        } else if (cur_fuzzy$lower_bound < cur_wp$lower_bound & cur_fuzzy$upper_bound <= cur_wp$upper_bound) {
715
          # crop fuzzy
716
          cur_fuzzy$upper_bound = cur_wp$lower_bound - 1
717
          ret = cur_fuzzy
718
        } else if (cur_wp$lower_bound > cur_fuzzy$lower_bound & cur_wp$upper_bound < cur_fuzzy$upper_bound) {
719
          # split fuzzy
720
          tmp_ret_fuzzy_1 = cur_fuzzy
721
          tmp_ret_fuzzy_1$upper_bound = cur_wp$lower_bound - 1
722
          tmp_ret_fuzzy_2 = cur_fuzzy
723
          tmp_ret_fuzzy_2$lower_bound = cur_wp$upper_bound + 1
724
          ret = rec_substract_region(rbind(tmp_ret_fuzzy_1, tmp_ret_fuzzy_2), inter_wp)
725
          # print(ret)
726
          # ret = cur_fuzzy
727
          break
728
        } else {
729
          stop("WARNING NO ADAPTED CASE!")
730
        }
731
      }
732
      return(ret)
733
    } else {
734
      return(cur_fuzzy)
735
    }
736
  })
737
  }
738
  non_inter_fuzzy = rec_substract_region(region1[,1:4], region2[,1:4])
739
  if (is.null(non_inter_fuzzy)) {return(non_inter_fuzzy)}
740
  tmp_ulist = unlist(non_inter_fuzzy)
741
  tmp_names = names(tmp_ulist)[1:4]
742
  non_inter_fuzzy = data.frame(matrix(tmp_ulist, ncol=4, byrow=TRUE), stringsAsFactors=FALSE)
743
  names(non_inter_fuzzy) = tmp_names
744
  non_inter_fuzzy$chr = as.character(non_inter_fuzzy$chr)
745
  non_inter_fuzzy$chr = as.numeric(non_inter_fuzzy$chr)
746
  non_inter_fuzzy$lower_bound = as.numeric(non_inter_fuzzy$lower_bound)
747
  non_inter_fuzzy$upper_bound = as.numeric(non_inter_fuzzy$upper_bound)
748
  non_inter_fuzzy = non_inter_fuzzy[order(non_inter_fuzzy$lower_bound),]
749
  return(non_inter_fuzzy)
750
}
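# Usage sketch (hypothetical values): subtract a well-positioned region from a fuzzy one.
# region1 = data.frame(chr = 1, lower_bound = 100, upper_bound = 500, cur_index = 1)
# region2 = data.frame(chr = 1, lower_bound = 200, upper_bound = 300, cur_index = 1)
# substract_region(region1, region2)  # two regions remain: [100,199] and [301,500]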
751

    
752
union_regions = function(# Aggregate regions that intersect each other.
### This function sorts regions by lower bound to detect intersections: each lower bound is compared to the upper bound of the previous region. It embeds a while loop that stops when the region list becomes stable.
754
regions ##<< The Regions to be aggregated
755
) {
756
  if (is.null(regions)) {return(regions)}
757
  if (nrow(regions) == 0) {return(regions)}
758
  old_length = length(regions[,1])
759
  new_length = 0
760

    
761
  while (old_length != new_length) {
762
    regions = regions[order(regions$lower_bound), ]
763
    regions$stop = !c(regions$lower_bound[-1] - regions$upper_bound[-length(regions$lower_bound)] <= 1, TRUE)
764
    vec_end_1 = which(regions$stop)
765
    if (length(vec_end_1) == 0) {
766
      vec_end_1 = c(length(regions$stop))
767
    }
768
    if (vec_end_1[length(vec_end_1)] != length(regions$stop)) {
769
      vec_end_1 = c(vec_end_1, length(regions$stop))
770
    }
771
    vec_beg_1 = c(1, vec_end_1[-length(vec_end_1)] + 1)
772
    union = apply(t(1:length(vec_beg_1)), 2, function(i) {
773
      chr = regions$chr[vec_beg_1[i]]
774
      lower_bound = min(regions$lower_bound[vec_beg_1[i]:vec_end_1[i]])
775
      upper_bound = max(regions$upper_bound[vec_beg_1[i]:vec_end_1[i]])
776
      cur_index = regions$cur_index[vec_beg_1[i]]
777
      data.frame(list(chr=chr, lower_bound=lower_bound, upper_bound=upper_bound, cur_index=cur_index))
778
      })
779
    union = collapse_regions(union)
780
    old_length = length(regions[,1])
781
    new_length = length(union[,1])
782
    regions = union
783
  }
784
  return(union)
785
}
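# Usage sketch (hypothetical values): overlapping or adjacent regions are merged.
# regions = data.frame(chr = 1, lower_bound = c(100, 180, 400),
#                      upper_bound = c(200, 260, 500), cur_index = 1)
# union_regions(regions)  # two regions remain: [100,260] and [400,500]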
786

    
787
# remove_aligned_wp = function(# Remove wp nucs from common nucs list.
788
# ### It is based on common wp nucs index on nucs and region.
789
# strain_maps, ##<< Nuc maps.
790
# cur_index, ##<< The region of interest index.
791
# tmp_common_nucs, ##<< the list of wp nucs.
792
# strain##<< The strain to consider.
793
# ){
794
#   fuzzy_nucs = strain_maps[[strain]]
795
#   fuzzy_nucs = fuzzy_nucs[fuzzy_nucs$cur_index == cur_index,]
796
#   fuzzy_nucs = fuzzy_nucs[order(fuzzy_nucs$index_nuc),]
797
#   if (length(fuzzy_nucs[,1]) == 0) {return(fuzzy_nucs)}
798
#   if (sum(fuzzy_nucs$index_nuc == min(fuzzy_nucs$index_nuc):max(fuzzy_nucs$index_nuc)) != max(fuzzy_nucs$index_nuc)) {"Warning in index!"}
799
#   anti_index_1 = tmp_common_nucs[[paste("index_nuc", strain, sep="_")]]
800
#   fuzzy_nucs = fuzzy_nucs[-anti_index_1,]
801
#   return(fuzzy_nucs)
802
# }
803

    
804
translate_regions = function(# Translate a list of regions from a strain ref to another.
805
### This function is an elaborate wrapper around translate_cur.
806
regions, ##<< Regions to be translated.
807
combi, ##<< Combination of strains.
808
cur_index, ##<< The region of interest index.
809
config=NULL, ##<< GLOBAL config variable
810
roi ##<< The region of interest.
811
) {
812
  tr_regions = apply(t(1:length(regions[,1])), 2, function(i) {
813
    tmp_regions_ref2 = list(name="foo", begin=regions[i,]$lower_bound, end=regions[i,]$upper_bound, chr=as.character(regions[i,]$chr), strain_ref = combi[2])
814
    big_cur =  roi
815
    trs_tmp_regions_ref2 = translate_cur(tmp_regions_ref2, combi[1], config = config, big_cur = big_cur)
816
    if (is.null(trs_tmp_regions_ref2)) {
817
      return(NULL)
818
    } else {
819
      return(data.frame(list(chr=trs_tmp_regions_ref2$chr, lower_bound=min(trs_tmp_regions_ref2$begin, trs_tmp_regions_ref2$end), upper_bound=max(trs_tmp_regions_ref2$begin, trs_tmp_regions_ref2$end), cur_index=cur_index)))
820
    }
821
  })
822

    
823
  return(collapse_regions(tr_regions))
824
}
825

    
826
collapse_regions = function(# Reformat an "apply-manipulated" list of regions
### Utility to reformat a list of regions produced by apply into a single dataframe.
regions ##<< A list of regions.
829
) {
830
  if (is.null(regions)) {
831
    return(NULL)
832
  } else {
833
    regions = do.call(rbind, regions)
834
    regions$chr = as.character(regions$chr)
835
    regions$chr = as.numeric(regions$chr)
836
    regions$lower_bound = as.numeric(regions$lower_bound)
837
    regions$upper_bound = as.numeric(regions$upper_bound)
838
    regions = regions[order(regions$lower_bound),]
839
    return(regions)
840
  }
841
}
crop_fuzzy = function(# Crop region bounds according to the region of interest bounds
### This function is no longer necessary since the "big_cur" bug was removed from the translate_cur function.
tmp_fuzzy_nucs, ##<< The regions to be cropped.
847
roi, ##<< The region of interest.
848
strain, ##<< The strain to consider.
849
config=NULL ##<< GLOBAL config variable
850
) {
851
  tr_roi = translate_cur(roi, strain, config = config)
852
  tr_roi_begin = min(tr_roi$begin, tr_roi$end)
853
  tr_roi_end = max(tr_roi$begin, tr_roi$end)
854
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound < tr_roi_begin,1]) > 0) {
855
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound < tr_roi_begin,]$lower_bound = tr_roi_begin
856
  }
857
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound < tr_roi_begin,1]) > 0) {
858
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound < tr_roi_begin,]$upper_bound = tr_roi_begin
859
  }
860
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound > tr_roi_end,1]) > 0) {
861
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound > tr_roi_end,]$lower_bound = tr_roi_end
862
  }
863
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound > tr_roi_end,1]) > 0) {
864
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound > tr_roi_end,]$upper_bound = tr_roi_end
865
  }
866
  tmp_fuzzy_nucs = tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound != tmp_fuzzy_nucs$lower_bound,]
867
  return(tmp_fuzzy_nucs)
868
}
869

    
870
get_all_reads = function(# Retrieve Reads
871
### Retrieve reads for a given marker, combi, form.
872
marker, ##<< The marker to consider.
combi, ##<< The strain combination to consider.
form="wp", ##<< The nuc form to consider.
875
config=NULL ##<< GLOBAL config variable
876
) {
877
	all_reads = NULL
878
  for (manip in c("Mnase_Seq", marker)) {
879
    if (form == "unr") {
880
		  out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_unr_and_nbreads.tab",sep="")
881
  		tmp_res = read.table(file=out_filename, header=TRUE)
882
			tmp_res = tmp_res[tmp_res[,3] - tmp_res[,2] > 75,]
883
      tmp_res$form = form
884
    } else if (form == "wp") {
885
		 	out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_wp_and_nbreads.tab",sep="")
886
  		tmp_res = read.table(file=out_filename, header=TRUE)
887
      tmp_res$form = form
888
    } else if (form == "wpunr") {
889
		 	out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_wp_and_nbreads.tab",sep="")
890
  		tmp_res = read.table(file=out_filename, header=TRUE)
891
      tmp_res$form = "wp"
892
		  out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_unr_and_nbreads.tab",sep="")
893
  		tmp_res2 = read.table(file=out_filename, header=TRUE)
894
			tmp_res2 = tmp_res2[tmp_res2[,3] - tmp_res2[,2] > 75,]
895
      tmp_res2$form = "unr"
896
      tmp_res = rbind(tmp_res, tmp_res2)
897
    }
898
		if (is.null(all_reads)) {
899
			all_reads = tmp_res[,c(1:9,length(tmp_res))]
900
		}
901
		tmp_res = tmp_res[,-c(1:9,length(tmp_res))]
902
		all_reads = cbind(all_reads, tmp_res)
903
  }
904
  return(all_reads)
905
}
906

    
907
get_design = function(# Build the design for DESeq
908
### This function builds the design according to sample properties.
marker, ##<< The marker to consider.
combi, ##<< The strain combination to consider.
911
all_samples ##<< Global list of samples.
912
) {
913
  off1 = 0
914
  off2 = 0
915
	manips = c("Mnase_Seq", marker)
916
	design_rownames = c()
917
	design_manip = c()
918
	design_strain = c()
919
  off2index = function(off) {
920
  	switch(toString(off),
921
  		"1"=c(0,1,1),
922
  	  "2"=c(1,0,1),
923
    	"3"=c(1,1,0),
924
  		c(1,1,1)
925
  		)
926
  }
927
	for (manip in manips) {
928
		tmp_samples = all_samples[ ((all_samples$strain == combi[1] | all_samples$strain == combi[2]) &  all_samples$marker == manip), ]
929
		tmp_samples = tmp_samples[order(tmp_samples$strain), ]
930
		if (manip == "H3K4me1" & (off1 != 0 & off2 ==0 )) {
931
			tmp_samples = tmp_samples[c(off2index(off1), c(1,1)) == 1,]
932
		} else {
933
			if (manip != "Mnase_Seq" & (off1 != 0 | off2 !=0)) {
934
				tmp_samples = tmp_samples[c(off2index(off1), off2index(off2)) == 1,]
935
			}
936
		}
937
		design_manip = c(design_manip, rep(manip, length(tmp_samples$id)))
938
		for (strain in combi) {
939
			cols = apply(t(tmp_samples[ (tmp_samples$strain == strain &  tmp_samples$marker == manip), ]$id), 2, function(i){paste(strain, manip, i, sep="_")})
940
			design_strain = c(design_strain, rep(strain, length(cols)))
941
			design_rownames = c(design_rownames, cols)
942
		}
943
	}
944
	snep_design = data.frame( row.names=design_rownames, manip=design_manip, strain=design_strain)
945
	return(snep_design)
946
}
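# Usage sketch with a toy sample table (only the id, strain and marker columns are
# used here; the real all_samples table has more columns):
# all_samples = data.frame(id = 1:12,
#                          strain = rep(c("BY", "RM"), each = 3, times = 2),
#                          marker = rep(c("Mnase_Seq", "H3K4me1"), each = 6),
#                          stringsAsFactors = FALSE)
# get_design("H3K4me1", c("BY", "RM"), all_samples)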
947

    
948
plot_dist_samples = function(# Plot the distribution of reads.
949
### This function uses the DESeq normalization feature to qualitatively compare the distributions.
strain, ##<< The strain to consider.
marker, ##<< The marker to consider.
952
res, ##<< Data
953
all_samples, ##<< Global list of samples.
954
NEWPLOT = TRUE ##<< If FALSE, the curves will be added to the current plot.
955
) {
956
	cols = apply(t(all_samples[ (all_samples$strain == strain &  all_samples$marker == marker), ]$id), 2, function(i){paste(strain, marker, i, sep="_")})
957
	snepCountTable = res[,cols]
958
	snepDesign = data.frame(
959
		row.names = cols,
960
		manip = rep(marker, length(cols)),
961
		strain = rep(strain, length(cols))
962
		)
963
	cdsFull = newCountDataSet(snepCountTable, snepDesign)
964
	sizeFactors = estimateSizeFactors(cdsFull)
965
	# print(sizeFactors[[1]])
966
	sample_ids = all_samples[ (all_samples$strain == strain &  all_samples$marker == marker), ]$id
967
	if (NEWPLOT) {
968
		plot(density(res[,paste(strain, marker, sample_ids[1], sep="_")] / sizeFactors[[1]][1]), col=0, main=paste(strain, marker))
969
		NEWPLOT = FALSE
970
	}
971
	for (it in 1:length(sample_ids)) {
972
		sample_id = sample_ids[it]
973
		lines(density(res[,paste(strain, marker, sample_id, sep="_")] / sizeFactors[[1]][it]), col = it + 1, lty = it)
974
	}
975
  legend("topright", col=(1:length(sample_ids))+1, lty=1:length(sample_ids), legend=cols)
976
}
977

    
978
analyse_design = function(# Launch DESeq methods.
979
### This function is based on the DESeq example. It normalizes the data, fits GLM models with and without the interaction term, and compares the two models.
980
snep_design, ##<< The design to consider.
981
reads ##<< The data to consider.
982
) {
983
	snep_count_table = reads[, rownames(snep_design)]
984
	cdsFull = newCountDataSet(snep_count_table, snep_design)
985
	cdsFull1 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
986
	fit1 = fitNbinomGLMs(cdsFull1, count ~ manip * strain)
987

    
988
	cdsFull0 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
989
	fit0 = fitNbinomGLMs(cdsFull0, count ~ manip + strain)
990

    
991
	pvalsGLM = nbinomGLMTest( fit1, fit0 )
992
	return(list(fit1, fit0, snep_design, pvalsGLM))
993
}
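# Usage sketch (requires the DESeq package; see get_sneps() below for the full pipeline):
# snep_design = get_design(marker, combi, all_samples)
# reads = get_all_reads(marker, combi, form, config = config)
# res = analyse_design(snep_design, reads)  # list(fit1, fit0, design, pvalsGLM)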
get_sneps = structure(function(# Compute the list of SNEPs for a given set of marker, strain combination and nuc form.
1003
### This function combines get_design, get_all_reads and analyse_design, then applies the FDR cutoff to call SNEPs.
1004
marker, ##<< The marker involved.
1005
combi, ##<< The strain combination involved.
1006
form, ##<< the nuc form involved.
1007
all_samples, ##<< Global list of samples.
1008
FDR = 0.0001, ##<< The specified False Discovery Rate.
1009
config=NULL ##<< GLOBAL config variable
1010
) {
1011
  # PRETREAT
1012
  snep_design = get_design(marker, combi, all_samples)
1013
  reads = get_all_reads(marker, combi, form, config=config)
1014
  # RUN ANALYSE
1015
  tmp_analyse = analyse_design(snep_design, reads)
1016
  # RESULTS
1017
	fit1 = tmp_analyse[[1]]
1018
	fit0 = tmp_analyse[[2]]
1019
  k = names(fit1)
1020
  reads[[k[2]]] = signif(fit1[[k[2]]], 5)
1021
  reads[[k[3]]] = signif(fit1[[k[3]]], 5)
1022
  reads[[k[4]]] = signif(fit1[[k[4]]], 5)
1023
	reads$pvalsGLM = signif(tmp_analyse[[4]], 5)
1024
	snep_design = tmp_analyse[[3]]
1025
  # print(snep_design)
1026
	thres = FDR(reads$pvalsGLM, FDR)
1027
	reads$snep_index = reads$pvalsGLM < thres
1028
	print(paste(sum(reads$snep_index), " SNEPs found for ", length(reads[,1])," nucs and ", FDR*100,"% of FDR.", sep = ""))
1029
  return(reads)
1030
  },  ex=function(){
1031
    marker = "H3K4me1"
1032
    combi = c("BY", "YJM")
1033
    form = "wpunr" # "wp" | "unr" | "wpunr"
1034
    # foo = get_sneps(marker, combi, form)
1035
    # foo = get_sneps("H4K12ac", c("BY", "RM"), "wp")
1036
})
ROM2ARAB = function(# Roman to Arabic pair list.
1062
### Utility to convert Roman numbers into Arabic numbers
1063
){list(
1064
  "I" = 1,
1065
  "II" = 2,
1066
  "III" = 3,
1067
  "IV" = 4,
1068
  "V" = 5,
1069
  "VI" = 6,
1070
  "VII" = 7,
1071
  "VIII" = 8,
1072
  "IX" = 9,
1073
  "X" = 10,
1074
  "XI" = 11,
1075
  "XII" = 12,
1076
  "XIII" = 13,
1077
  "XIV" = 14,
1078
  "XV" = 15,
1079
  "XVI" = 16,
1080
  "XVII" = 17,
1081
  "XVIII" = 18,
1082
  "XIX" = 19,
1083
  "XX" = 20
1084
)}
1085

    
1086
switch_pairlist = structure(function(# Switch a pairlist
1087
### Take a pairlist key:value and return the switched pairlist value:key.
1088
l ##<< The pairlist to switch.
1089
) {
1090
	ret = list()
1091
	for (name in names(l)) {
1092
		ret[[as.character(l[[name]])]] = name
1093
	}
1094
	ret
1095
### The switched pairlist.
1096
}, ex=function(){
1097
	l = list(key1 = "value1", key2 = "value2")
1098
	print(switch_pairlist(l))
1099
})
1100

    
1101
ARAB2ROM = function(# Arabic to Roman pair list.
1102
### Utility to convert Arabic numbers to Roman numbers
1103
){switch_pairlist(ROM2ARAB())}
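# Usage sketch:
# ROM2ARAB()[["XIV"]]  # 14
# ARAB2ROM()[["14"]]   # "XIV"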
c2c_extraction = function(# Extract a sub-part of the corresponding c2c file
### This function gives access to a specific part of the c2c file.
1110
strain1, ##<< the key strain
1111
strain2, ##<< the target strain
1112
chr=NULL, ##<< if defined, the c2c will be filtered according to the chromosome value
1113
lower_bound=NULL, ##<< if defined, the c2c will be filtered for part of the genome upper than lower_bound
1114
upper_bound=NULL, ##<< if defined, the c2c will be filtered for part of the genome lower than upper_bound
1115
config=NULL##<<  GLOBAL config variable
1116
) {
1117
  reverse = (strain1=="RM" & strain2=="BY") | strain1=="YJM"
1118
	# Load the c2c file
1119
	if (reverse) {
1120
		c2c_filename = config$C2C_FILES[[paste(strain2, "-", strain1, sep="")]]
1121
	} else {
1122
		c2c_filename = config$C2C_FILES[[paste(strain1, "-", strain2, sep="")]]
1123
	}
1124
	c2c = get_content(c2c_filename, "table", stringsAsFactors=FALSE)
1125
  # Filtering ungapped entries
1126
  c2c = c2c[c2c$V6=="-",]
1127
	# Reverse
1128
	if (reverse) {
1129
		tmp_col = c2c$V1
1130
		c2c$V1 = c2c$V7
1131
		c2c$V7 = tmp_col
1132
		tmp_col = c2c$V2
1133
		c2c$V2 = c2c$V9
1134
		c2c$V9 = tmp_col
1135
		tmp_col = c2c$V3
1136
		c2c$V3 = c2c$V10
1137
		c2c$V10 = tmp_col
1138
	}
1139
  if (!is.null(chr)) {
1140
  	if (strain1 == "BY") {
1141
  		chro_1 = paste("chr", ARAB2ROM()[[chr]], sep="")
1142
  	} else if (strain1 == "RM") {
1143
  	  chro_1 = paste("supercontig_1.",chr,sep="")
1144
  	} else if (strain1 == "YJM") {
1145
  	  chro_1 = switch_pairlist(config$FASTA_INDEXES$YJM)[[chr]]
1146
  	}
1147
  	c2c = c2c[c2c$V1 == chro_1,]
1148
    if (!is.null(lower_bound)) {
1149
      if (length(c2c[c2c$V3 < lower_bound & c2c$V2 < c2c$V3, 1] > 0)) {c2c[c2c$V3 < lower_bound & c2c$V2 < c2c$V3,c("V2", "V3") ] = lower_bound}
1150
      if (length(c2c[c2c$V2 < lower_bound & c2c$V3 < c2c$V2, 1] > 0)) {c2c[c2c$V2 < lower_bound & c2c$V3 < c2c$V2,c("V2", "V3") ] = lower_bound}      
1151
      c2c = c2c[c2c$V2 - c2c$V3 != 0,]
1152
    }
1153
    if (!is.null(upper_bound)) {
1154
      if (length(c2c[c2c$V2 > upper_bound & c2c$V2 < c2c$V3, 1] > 0)) {c2c[c2c$V2 > upper_bound & c2c$V2 < c2c$V3, c("V2", "V3")] = upper_bound}
1155
      if (length(c2c[c2c$V3 > upper_bound & c2c$V3 < c2c$V2, 1] > 0)) {c2c[c2c$V3 > upper_bound & c2c$V3 < c2c$V2, c("V2", "V3")] = upper_bound}      
1156
      c2c = c2c[c2c$V2 - c2c$V3 != 0,]
1157
    }
1158
  }
1159
  return(c2c)
1160
### Returns the appropriate part of the c2c file.
1161
}
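# Usage sketch (requires config$C2C_FILES pointing to the alignment c2c files):
# c2c = c2c_extraction("BY", "RM", chr = "4", lower_bound = 10000,
#                      upper_bound = 50000, config = config)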
1162

    
1163
translate_cur = structure(function(# Translate coords of a genome region.
1164
### This function is used in the examples; usually you have to define your own translation function and overwrite this one using \emph{unlockBinding}. Please refer to the example.
1165
roi, ##<< Original genome region of interest.
1166
strain2, ##<< The strain in which you want the genome region of interest.
1167
config=NULL, ##<< GLOBAL config variable
1168
big_cur=NULL ##<< A region larger than roi, used to filter the c2c if needed.
1169
) {
1170
	strain1 = roi$strain_ref  
1171
  # Do something or nothing?
1172
  if (is.null(config$NEUTRAL_TRANSLATE_CUR)) {
1173
    config$NEUTRAL_TRANSLATE_CUR = FALSE
1174
  }
1175
	if (strain1 == strain2 | config$NEUTRAL_TRANSLATE_CUR) {
1176
    roi$strain_ref = strain2
1177
    roi$length = roi$end - roi$begin + sign(roi$end - roi$begin)
1178
		return(roi)
1179
	}
1180
  
1181
	# Extract c2c file
1182
	if (!is.null(big_cur)) {
1183
  	# Dealing with big_cur
1184
		if (roi$strain_ref != big_cur$strain_ref) {
1185
      big_cur = translate_cur(big_cur, roi$strain_ref, config=config)
1186
    }
1187
    if (big_cur$end < big_cur$begin) {
1188
      tmp_var = big_cur$begin
1189
      big_cur$begin = big_cur$end
1190
      big_cur$end = tmp_var
1191
      big_cur$length = big_cur$end - big_cur$begin + 1
1192
    }
1193
    if (big_cur$chr!=roi$chr | roi$end > big_cur$end | roi$end < big_cur$begin | roi$begin > big_cur$end | roi$begin < big_cur$begin) {
1194
      print("WARNING! Trying to translate a roi not included in a big_cur.")
1195
      return(NULL)
1196
    }
1197
  	c2c = c2c_extraction(strain1, strain2, big_cur$chr, big_cur$begin, big_cur$end, config=config)
1198
  } else {
1199
    # No big_cur
1200
  	c2c = c2c_extraction(strain1, strain2, roi$chr, config=config)    
1201
  }
1202
  
1203
  #	Convert initial roi$chr into c2c format
1204
  reverse = (strain1=="RM" & strain2=="BY") | strain1=="YJM"
1205
	begin_1 = roi$begin
1206
  end_1 = roi$end
1207
  if (reverse) {
1208
  	tmptransfostart = c2c[(c2c$V3>=begin_1 & c2c$V2<=begin_1 & c2c$V8==1) | (c2c$V2>=begin_1 & c2c$V3<=begin_1 & c2c$V8==-1),]
1209
    tmptransfostop = c2c[(c2c$V3>=end_1   & c2c$V2<=end_1   & c2c$V8==1) | (c2c$V2>=end_1   & c2c$V3<=end_1   & c2c$V8==-1),]
1210
	} else {
1211
		tmptransfostart = c2c[c2c$V3>=begin_1 & c2c$V2<=begin_1,]
1212
	  tmptransfostop = c2c[c2c$V3>=end_1 & c2c$V2<=end_1,]
1213
	}
1214

    
1215
	# Conditions that should never happen ...
1216
	{
1217
		if (length(tmptransfostart$V8) == 0) {
1218
			# begin_1 is between two lines: shift begin_1 to the start of the 2nd line.
1219
      if (sum(c2c$V3 >= c2c$V2) != 0) {
1220
  			tmp_c2c = c2c[c2c$V2>=begin_1,]
1221
  			begin_1 = min(tmp_c2c$V2)
1222
      } else {
1223
  			tmp_c2c = c2c[c2c$V3>=begin_1,]
1224
  			begin_1 = min(tmp_c2c$V3)
1225
      }
1226
			if (reverse) {
1227
		  	tmptransfostart = c2c[(c2c$V3>=begin_1 & c2c$V2<=begin_1 & c2c$V8==1) | (c2c$V2>=begin_1 & c2c$V3<=begin_1 & c2c$V8==-1),]
1228
			} else {
1229
				tmptransfostart = c2c[c2c$V3>=begin_1 & c2c$V2<=begin_1,]
1230
			}
1231
			if (length(tmptransfostart$V8) == 0) {
1232
				if (!is.null(big_cur)) {
1233
					return(NULL)
1234
					tmptransfostart = c2c[c2c$V3>=big_cur$begin & c2c$V2<=big_cur$begin,]
1235
				} else {
1236
					print(tmptransfostart)
1237
					print(tmptransfostop)
1238
					stop("Never happend condition 1.")
1239
				}
1240
			}
1241
		}
1242
		if (length(tmptransfostop$V8) == 0) {
1243
			# end_1 is between two lines: shift end_1 to the end of the 2nd line.
1244
      if (sum(c2c$V3 >= c2c$V2) != 0) {
1245
  			tmp_c2c = c2c[c2c$V3<=end_1,]
1246
  			end_1 = max(tmp_c2c$V3)
1247
      } else {
1248
  			tmp_c2c = c2c[c2c$V2<=end_1,]
1249
  			end_1 = max(tmp_c2c$V2)
1250
      }
1251
			if (reverse) {
1252
		    tmptransfostop = c2c[(c2c$V3>=end_1   & c2c$V2<=end_1   & c2c$V8==1) | (c2c$V2>=end_1   & c2c$V3<=end_1   & c2c$V8==-1),]
1253
			} else {
1254
			  tmptransfostop = c2c[c2c$V3>=end_1 & c2c$V2<=end_1,]
1255
			}
1256
			if (length(tmptransfostop$V8) == 0) {
1257
				if (!is.null(big_cur)) {
1258
					return(NULL)
1259
				  tmptransfostop = c2c[c2c$V3>=big_cur$end & c2c$V2<=big_cur$end,]
1260
				} else {
1261
					print(tmptransfostart)
1262
					print(tmptransfostop)
1263
					stop("Never happend condition 2.")
1264
				}
1265
			}
1266
		}
1267
		if (length(tmptransfostart$V8) != 1) {
1268
			# print("many start")
1269
			tmptransfostart = tmptransfostart[tmptransfostart$V3>=begin_1 & tmptransfostart$V2==begin_1,]
1270
			if (length(tmptransfostart$V8) != 1) {
1271
				print(tmptransfostart)
1272
				print(tmptransfostop)
1273
  			stop("Never happend condition 3.")
1274
			}
1275
		}
1276
		if (length(tmptransfostop$V8) != 1) {
1277
			# print("many stop")
1278
		  tmptransfostop = tmptransfostop[tmptransfostop$V3==end_1 & tmptransfostop$V2<=end_1,]
1279
			if (length(tmptransfostop$V8) != 1) {
1280
				print(tmptransfostart)
1281
				print(tmptransfostop)
1282
  			stop("Never happend condition 4.")
1283
			}
1284
		}
1285
		if (tmptransfostart$V7 != tmptransfostop$V7) {
1286
			print(tmptransfostart)
1287
			print(tmptransfostop)
1288
 			stop("Problem with genome region of interest of strain 1. \nIt is translated over many contigs into strain 2 ref. \nSorry, but you have to redefine your region of interest.")
1289
		}
1290
	}
1291
  
1292
  # Deal with strand
1293
  if (tmptransfostart$V8 == 1) {
1294
    begin_2 = tmptransfostart$V9 + (begin_1 - tmptransfostart$V2)
1295
    end_2 = tmptransfostop$V9 + (end_1 - tmptransfostop$V2)
1296
  } else {
1297
    begin_2 = tmptransfostart$V9 - (begin_1 - tmptransfostart$V2)
1298
    end_2 = tmptransfostop$V9 - (end_1 - tmptransfostop$V2)
1299
  }
1300
	# Build returned roi
1301
	roi$strain_ref = strain2
1302
	if (roi$strain_ref == "BY") {
1303
		roi$chr = ROM2ARAB()[[substr(tmptransfostart$V7, 4, 12)]]
1304
	} else {
1305
		roi$chr = config$FASTA_INDEXES[[strain2]][[tmptransfostart$V7]]
1306
	}
1307
  roi$begin = begin_2
1308
  roi$end = end_2
1309
	if (sign(roi$end - roi$begin) == 0) {
1310
		roi$length = 1
1311
	} else {
1312
		roi$length = roi$end - roi$begin + sign(roi$end - roi$begin) * 1
1313
	}
1314
  return(roi)
1315
}, ex=function(){
1316
	# Define new translate_cur function...
1317
	translate_cur = function(roi, strain2, config) {
1318
		strain1 = roi$strain_ref
1319
		if (strain1 == strain2) {
1320
			return(roi)
1321
		} else {
1322
		  stop("Here is my new translate_cur function...")
1323
		}
1324
	}
1325
	# Bind it by uncommenting the following lines.
1326
	# unlockBinding("translate_cur", as.environment("package:nm"))
1327
	# unlockBinding("translate_cur", getNamespace("nm"))
1328
	# assign("translate_cur", translate_cur, "package:nm")
1329
	# assign("translate_cur", translate_cur, getNamespace("nm"))
1330
	# lockBinding("translate_cur", getNamespace("nm"))
1331
	# lockBinding("translate_cur", as.environment("package:nm"))
1332
})
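
# Hedged usage sketch (not part of the original examples): translating a BY region
# of interest into RM coordinates and back. It assumes `config` has been loaded from
# the JSON configuration file, as shown in the build_replicates example below.
# roi_by = list(chr="8", begin=472000, end=474000, strain_ref="BY")
# roi_rm = translate_cur(roi_by, "RM", config=config)
# roi_back = translate_cur(roi_rm, "BY", config=config)
# # roi_back$begin and roi_back$end should match roi_by, up to small indel shifts.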
compute_inter_all_strain_curs = function (# Compute Common Uninterrupted Regions (CUR)
1336
### CURs are regions that can be aligned between the genomes
1337
diff_allowed = 30, ##<< The maximum indel width allowed in a CUR.
1338
min_cur_width = 4000, ##<< The minimum width of a CUR
1339
config = NULL ##<< GLOBAL config variable
1340
) {
1341

    
1342
  check_overlaping = function(strain1, strain2, chr, lower_bound, upper_bound, config=NULL) {
1343
    c2c = c2c_extraction(strain1, strain2, chr, lower_bound, upper_bound, config=config)
1344
    check_homogeneity(c2c)
1345
  	if (length(c2c[,1]) == 0 ) {
1346
      stop("WARNING! checking overlapping for a region corresponding to an empty c2c.")
1347
    } else {
1348
  		lower_bounds = apply(t(1:nrow(c2c)), 2,function(i){l = c2c[i,]; min(l$V2, l$V3)})
1349
  		upper_bounds = apply(t(1:nrow(c2c)), 2,function(i){l = c2c[i,]; max(l$V2, l$V3)})
1350
  		tmp_index = order(lower_bounds)
1351
      lower_bounds = lower_bounds[tmp_index]
1352
      upper_bounds = upper_bounds[tmp_index]
1353
      tmp_diff = lower_bounds[-1] - upper_bounds[-length(upper_bounds)]
1354
      ov_index = which(tmp_diff < 0)
1355
      if (length(ov_index) != 0) {
        ov_index = ov_index[1]
        print(paste("WARNING! Overlapping", " (", strain1, ",", strain2, ") chr: ", c2c[1,]$V1, sep=""))
1358
        c2c_corrupted = c2c[tmp_index,][c(ov_index, ov_index + 1),]
1359
        print(c2c_corrupted)
1360
        return(list(lower_bounds[ov_index+1] - 1, upper_bounds[ov_index] + 1))
1361
      }
1362
      return(NULL)
1363
    }
1364
  }
1365

    
1366
  check_homogeneity = function(sub_c2c) {
1367
    tmp_signs = sign(sub_c2c$V2 - sub_c2c$V3)
1368
    tmp_signs = tmp_signs[tmp_signs != 0]
1369
  	if (sum(tmp_signs[1]  != tmp_signs)) {
1370
  		print(paste("*************** ERROR, non homogenous region (sign)! ********************"))
1371
      print(tmp_signs)
1372
  	}
1373
    tmp_signs2 = sign(sub_c2c$V9 - sub_c2c$V10)
1374
    tmp_signs2 = tmp_signs2[tmp_signs2 != 0]
1375
  	if (sum(tmp_signs2[1]  != tmp_signs2)) {
1376
  		print(paste("*************** ERROR, non homogenous region (sign2)! ********************"))
1377
      print(tmp_signs2)
1378
  	}
1379
  	if (length(unique(sub_c2c[,c(1,7,8)])[,2]) != 1) {
1380
  		print("*************** ERROR, non homogenous region chrs or V8! ********************")
1381
  	}
1382
  }
1383

    
1384
  test_and_squeeze_rois = function(foo, config=NULL) {
1385
    is_it_ok = function(list1, list2) {
1386
      bar = cbind(list1$begin, list2$begin, abs(list1$begin - list2$begin), list1$end, list2$end, abs(list1$end - list2$end), list1$length, list2$length, abs(list1$length - list2$length))
1387
      ok = length(bar[bar[,3] != 0 | bar[,6] != 0, ]) == 0
1388
      if (!ok) {
1389
        print(bar[bar[,3] != 0 | bar[,6] != 0, ])
1390
      }
1391
      return (ok)
1392
    }
1393
    squeeze_rois = function(list1, list2) {
1394
      rois = apply(t(1:nrow(list1)), 2, function(i){
1395
        roi = list1[i,]
1396
        roi2 = list2[i,]
1397
        roi$begin = max(roi$begin, roi2$begin)
1398
        roi$end = min(roi$end, roi2$end)
1399
        roi$length =  roi$end - roi$begin + 1
1400
        return(roi)
1401
      })
1402
      return(do.call(rbind, rois))
1403
    }
1404
    # foo_orig = compute_inter_all_strain_curs2(config=config)
1405
    # foo = foo_orig
1406
    STOP = FALSE
1407
    nb_round = 0
1408
    while(!STOP) {
1409
      nb_round = nb_round + 1
1410
      print(paste("2-2 round #", nb_round, sep=""))
1411
      fooby = translate_curs(foo, "BY", config=config)
1412
      fooyjm = translate_curs(foo, "YJM", config=config)
1413
      fooyjmby = translate_curs(fooyjm, "BY", config=config)
1414
      if (!is_it_ok(fooby, fooyjmby)) {
1415
        print("case 1")
1416
        foo = squeeze_rois(fooby, fooyjmby)
1417
    		next
1418
      }
1419
      foorm = translate_curs(foo, "RM", config=config)
1420
      foormby = translate_curs(foorm, "BY", config=config)
1421
      if (!is_it_ok(fooby, foormby)) {
1422
        print("case 2")
1423
        foo = squeeze_rois(fooby, foormby)
1424
    		next
1425
      }
1426
      fooyjmrm = translate_curs(fooyjm, "RM", config=config)
1427
      fooyjmrmyjm = translate_curs(fooyjmrm, "YJM", config=config)
1428
      if (!is_it_ok(fooyjm, fooyjmrmyjm)) {
1429
        print("case 3")
1430
        foo = squeeze_rois(fooyjm, fooyjmrmyjm)
1431
        next
1432
      }
1433
      foormyjm = translate_curs(foorm, "YJM", config=config)
1434
      foormyjmrm = translate_curs(foormyjm, "RM", config=config)
1435
      if (!is_it_ok(foorm, foormyjmrm)) {
1436
        print("case 4")
1437
        foo = squeeze_rois(foorm, foormyjmrm)
1438
        next
1439
      }
1440
      foo = translate_curs(foo, "BY", config=config)
1441
      STOP = TRUE
1442
    }
1443
    STOP = FALSE
1444
    nb_round = 0
1445
    while(!STOP) {
1446
      nb_round = nb_round + 1
1447
      print(paste("3-3 round #", nb_round, sep=""))
1448
      fooby = translate_curs(foo, "BY", config=config)
1449
      foobyrm = translate_curs(fooby, "RM", config=config)
1450
      foobyrmyjm = translate_curs(foobyrm, "YJM", config=config)
1451
      foobyrmyjmby = translate_curs(foobyrmyjm, "BY", config=config)
1452
      if (!is_it_ok(fooby, foobyrmyjmby)) {
1453
        print("case 1")
1454
        foo = squeeze_rois(fooby, foobyrmyjmby)
1455
      }
1456
      fooby = translate_curs(foo, "BY", config=config)
1457
      foobyyjm = translate_curs(fooby, "YJM", config=config)
1458
      foobyyjmrm = translate_curs(foobyyjm, "RM", config=config)
1459
      foobyyjmrmby = translate_curs(foobyyjmrm, "BY", config=config)
1460
      if (!is_it_ok(fooby, foobyyjmrmby)) {
1461
        print("case 2")
1462
        foo = squeeze_rois(fooby, foobyyjmrmby)
1463
        next
1464
      }
1465
      foo = translate_curs(foo, "BY", config=config)
1466
      STOP = TRUE
1467
    }
1468
    print("end")
1469
    return(foo)
1470
  }
1471

    
1472
  get_inter_strain_rois = function(strain1, strain2, diff_allowed = 30, min_cur_width = 200, config=NULL) {
1473
    c2c = c2c_extraction(strain1, strain2, config=config)
1474
    # computing diffs
1475
    diff = c2c$V2[-1] - c2c$V3[-length(c2c$V2)]
1476
    diff2 = c2c$V9[-1] - c2c$V10[-length(c2c$V2)]
1477
    # Filtering
1478
  	indexes_stop = which(abs(diff) > diff_allowed | abs(diff2) > diff_allowed)
1479
  	indexes_start = c(1, indexes_stop[-length(indexes_stop)] + rep(1, length(indexes_stop) -1))
1480
    rois = apply(t(1:length(indexes_start)), 2, function(i) {
1481
      if ( i %% 20 == 1) print(paste(i, "/", length(indexes_start)))
1482
      returned_rois = NULL
1483
  		start = indexes_start[i]
1484
  		stop = indexes_stop[i]
1485
  		sub_c2c = c2c[start:stop,]
1486
      check_homogeneity(sub_c2c)
1487
  		if (strain1 == "BY") {
1488
  			chr = ROM2ARAB()[[substr(sub_c2c[1,]$V1,4,10)]]
1489
  		} else {
1490
  			chr = config$FASTA_INDEXES[[strain1]][[sub_c2c[1,]$V1]]
1491
  		}
1492
  		roi = list(chr=chr, begin=min(c(sub_c2c$V2,sub_c2c$V3)), end=max(c(sub_c2c$V2,sub_c2c$V3)), strain_ref=strain1)
1493
  		roi$length = roi$end - roi$begin + 1
1494
  		if (roi$length >= min_cur_width) {
1495
  			lower_bound = roi$begin
1496
  			upper_bound = roi$end
1497
        check = check_overlaping(strain1, strain2, chr, lower_bound, upper_bound, config=config)
1498
        while(!is.null(check)) {
1499
          # print(check)
1500
      		roi1 = roi
1501
          roi1$end = check[[1]]
1502
      		roi1$length = roi1$end - roi1$begin + 1
1503
      		if (roi1$length >= min_cur_width) {
1504
            returned_rois = dfadd(returned_rois,roi1)
1505
          }
1506
          roi$begin = check[[2]]
1507
      		roi$length = roi$end - roi$begin + 1
1508
      		if (roi$length >= min_cur_width) {
1509
      			lower_bound = min(roi$begin, roi$end)
1510
      			upper_bound = max(roi$begin, roi$end)
1511
            check = check_overlaping(strain1, strain2, chr, lower_bound, upper_bound, config=config)
1512
          } else {
1513
            check = NULL
1514
            roi = NULL
1515
          }
1516
        }
1517
        returned_rois = dfadd(returned_rois,roi)
1518
  	  }
1519
    })
1520
    rois = do.call(rbind,rois)
1521
    rois = rois[order(as.numeric(rois$chr), rois$begin), ]
1522
  	return(rois)
1523
  }
1524

    
1525
  translate_curs = function(rois, target_strain, config) {
1526
    tr_rois = apply(t(1:nrow(rois)), 2, function(i){
1527
      roi = rois[i,]
1528
      tr_roi = translate_cur(roi, target_strain, config=config)  
1529
      tmp_begin = min(tr_roi$begin, tr_roi$end)
1530
      tmp_end = max(tr_roi$begin, tr_roi$end)
1531
      tr_roi$begin = tmp_begin
1532
      tr_roi$end = tmp_end
1533
      tr_roi$length =  tr_roi$end - tr_roi$begin + 1
1534
      return(tr_roi)
1535
    })
1536
    tr_rois = do.call(rbind, tr_rois)
1537
    return(tr_rois)
1538
  }
1539

    
1540
  combis = list(c("BY", "RM"), c("BY", "YJM"), c("RM", "YJM"))
1541
  rois = list()
1542
  for (combi in combis) {
1543
    strain1 = combi[1]
1544
    strain2 = combi[2]
1545
    print(paste(strain1, strain2))
1546
    rois_fwd = get_inter_strain_rois(strain1, strain2, min_cur_width = min_cur_width, diff_allowed = diff_allowed, config=config)
1547
    strain1 = combi[2]
1548
    strain2 = combi[1]
1549
    print(paste(strain1, strain2))
1550
    rois_rev = get_inter_strain_rois(strain1, strain2, min_cur_width = min_cur_width, diff_allowed = diff_allowed, config=config)
1551
    tr_rois_rev = translate_curs(rois_rev, combi[1], config)      
1552
    region1 = rois_fwd
1553
    region2 = tr_rois_rev
1554
    rois[[paste(combi[1], combi[2], sep="_")]] = intersect_region(rois_fwd, tr_rois_rev)
1555
  }
1556
  reducted_1_rois = intersect_region(rois[["BY_RM"]], rois[["BY_YJM"]])
1557
  reducted_1_rois = reducted_1_rois[reducted_1_rois$length >= min_cur_width, ]
1558
  tr_reducted_1_rois = translate_curs(reducted_1_rois, "RM", config)  
1559
  reducted_2_rois = intersect_region(tr_reducted_1_rois, rois[["RM_YJM"]])
1560
  reducted_2_rois = reducted_2_rois[reducted_2_rois$length >= min_cur_width, ]
1561
  reducted_rois = translate_curs(reducted_2_rois, "BY", config)  
1562
  reducted_rois = reducted_rois[order(as.numeric(reducted_rois$chr), reducted_rois$begin), ]
1563
  squeezed_rois = test_and_squeeze_rois(reducted_rois, config=config)
1564
  return (squeezed_rois)
1565
}
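
# Hedged usage sketch (assumptions: `config` loaded as in the build_replicates example
# below, and the c2c alignment files referenced by the configuration are available).
# curs = compute_inter_all_strain_curs(diff_allowed=30, min_cur_width=4000, config=config)
# head(curs)  # one row per CUR, with columns chr, begin, end, strain_ref and length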
intersect_region = function(# Returns the intersection of 2 lists of regions.
### For each region of \emph{region1}, this function returns its intersection with the overlapping regions of \emph{region2}.
region1, ##<< Original regions.
region2 ##<< Regions to intersect.
1571
) {
1572
  intersection = apply(t(1:nrow(region1)), 2, function(i) {
1573
    roi1 = region1[i, ]
1574
    sub_regions2 = region2[region2$chr == roi1$chr, ]
1575
    sub_regions2 = sub_regions2[roi1$begin <= sub_regions2$begin & sub_regions2$begin <= roi1$end |
1576
                                roi1$begin <= sub_regions2$end & sub_regions2$end <= roi1$end |
1577
                                sub_regions2$begin < roi1$begin  & roi1$end < sub_regions2$end
1578
                                , ]
1579
    if (nrow(sub_regions2) == 0) {
1580
      print("removing a region")
1581
      return(NULL)
1582
    } else if (nrow(sub_regions2) > 1) {
1583
      print("more than one region in intersect_region")          
1584
      return(do.call(rbind, apply(t(1:nrow(sub_regions2)), 2, function(i) {intersect_region(roi1, sub_regions2[i,])})))
1585
    } else {
1586
      roi2 = sub_regions2[1,]
1587
      if (roi1$begin < roi2$begin) {
1588
        print("not the same begin")
1589
        roi1$begin = roi2$begin
1590
        roi1$length =  roi1$end - roi1$begin + 1
1591
      }
1592
      if (roi1$end > roi2$end) {
1593
        print("not the same end")
1594
        roi1$end = roi2$end
1595
        roi1$length =  roi1$end - roi1$begin + 1
1596
      }
1597
      return(roi1)
1598
    }         
1599
  })
1600
  return(do.call(rbind,intersection))
1601
}
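
# Hedged usage sketch with two hand-made region tables (illustrative values only).
# region1 = data.frame(chr="1", begin=c(100, 1000), end=c(500, 2000),
#                      strain_ref="BY", length=c(401, 1001), stringsAsFactors=FALSE)
# region2 = data.frame(chr="1", begin=c(300, 1500), end=c(600, 1800),
#                      strain_ref="BY", length=c(301, 301), stringsAsFactors=FALSE)
# intersect_region(region1, region2)
# # Each returned row is the part of a region1 row covered by the matching region2 row.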
build_replicates = structure(function(# Stage replicates data
1604
### This function loads into memory the data corresponding to the given experiments.
1605
expe, ##<< a list of vectors corresponding to replicates.
1606
roi, ##<< the region that we are interested in.
1607
only_fetch=FALSE, ##<< If TRUE, only fetch the inputs, without filtering them.
get_genome=FALSE,##<< Load (or not) the corresponding genome.
1609
all_samples, ##<< Global list of samples.
1610
config=NULL ##<< GLOBAL config variable.
1611
) {
1612
  build_samples = function(samples_ids, roi, only_fetch=FALSE, get_genome=TRUE, get_ouputs=TRUE, all_samples) {
1613
  	samples=list()
1614
  	for (i in samples_ids) {
1615
  		sample = as.list(all_samples[all_samples$id==i,])
1616
      sample$orig_roi = roi
1617
      sample$roi = translate_cur(roi, sample$strain, config = config)
1618
  		if (get_genome) {
1619
  			# Get Genome
1620
  			fasta_ref_filename = config$FASTA_REFERENCE_GENOME_FILES[[sample$strain]]
1621
  			sample$roi$genome = get_content(fasta_ref_filename, "fasta")[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end]
1622
  		}
1623
  		# Get inputs
1624
  		sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep="")
1625
  		sample$inputs = get_content(sample_inputs_filename, "table", stringsAsFactors=FALSE)
1626
  		sample$total_reads = sum(sample$inputs[,4])
1627
  		if (!only_fetch) {
1628
  		  sample$inputs = filter_tf_inputs(sample$inputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300, filter_for_coverage=TRUE)
1629
  	  }
1630
  	  # Get TF outputs for Mnase_Seq samples
1631
  		if (sample$marker == "Mnase_Seq" & get_ouputs) {
1632
  			sample_outputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep="")
1633
  			sample$outputs = get_content(sample_outputs_filename, "table", header=TRUE, sep="\t")
1634
  			if (!only_fetch) {
1635
  	  		sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr,  min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
1636
    		}
1637
  		}
1638
  		samples[[length(samples) + 1]] = sample
1639
  	}
1640
  	return(samples)
1641
  }
1642
	replicates = list()
1643
	for(samples_ids in expe) {
1644
		samples = build_samples(samples_ids, roi, only_fetch=only_fetch, get_genome=get_genome, all_samples=all_samples)
1645
		replicates[[length(replicates) + 1]] = samples
1646
	}
1647
	return(replicates)
1648
  }, ex = function() {
1649
    # library(rjson)
1650
    # library(nucleominer)
1651
    #
1652
    # # Read config file
1653
    # json_conf_file = "nucleominer_config.json"
1654
    # config = fromJSON(paste(readLines(json_conf_file), collapse=""))
1655
    # # Read sample file
1656
    # all_samples = get_content(config$CSV_SAMPLE_FILE, "cvs", sep=";", head=TRUE, stringsAsFactors=FALSE)
1657
    # # here are the sample ids in a list
1658
    # expes = list(c(1))
1659
    # here is the region whose coverage we want to see
    # cur = list(chr="8", begin=472000, end=474000, strain_ref="BY")
    # it displays the coverage
1662
    # replicates = build_replicates(expes, cur, all_samples=all_samples, config=config)
1663
    # out = watch_samples(replicates, config$READ_LENGTH,
1664
    #       plot_coverage = TRUE,
1665
    #       plot_squared_reads = FALSE,
1666
    #       plot_ref_genome = FALSE,
1667
    #       plot_arrow_raw_reads = FALSE,
1668
    #       plot_arrow_nuc_reads = FALSE,
1669
    #       plot_gaussian_reads = FALSE,
1670
    #       plot_gaussian_unified_reads = FALSE,
1671
    #       plot_ellipse_nucs = FALSE,
1672
    #       plot_wp_nucs = FALSE,
1673
    #       plot_wp_nuc_model = FALSE,
1674
    #       plot_common_nucs = FALSE,
1675
    #       height = 50)
1676
  })
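
# Hedged usage sketch (not part of the original example above): staging two replicates
# so that they can be compared with watch_samples. Sample ids 1 and 2 are assumed to
# exist in `all_samples`; `cur`, `all_samples` and `config` are built as in the example.
# expes = list(c(1), c(2))
# replicates = build_replicates(expes, cur, all_samples=all_samples, config=config)
# length(replicates)       # one entry per element of expes
# length(replicates[[1]])  # one entry per sample id within a replicate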
watch_samples = function(# Watching analysis of samples
1761
### This function displays the analysis for a particular region of the genome.
replicates, ##<< replicates under the form...
read_length, ##<< length of the reads
plot_ref_genome = TRUE, ##<< Plot (or not) the reference genome.
plot_arrow_raw_reads = TRUE,  ##<< Plot (or not) arrows for raw reads.
plot_arrow_nuc_reads = TRUE,  ##<< Plot (or not) arrows for reads associated with a nucleosome.
plot_squared_reads = TRUE,  ##<< Plot (or not) reads in the squared fashion.
plot_coverage = FALSE,  ##<< Plot (or not) reads in the coverage fashion.
plot_gaussian_reads = TRUE,  ##<< Plot (or not) the gaussian model of the F and R reads.
plot_gaussian_unified_reads = TRUE,  ##<< Plot (or not) the gaussian model of a nuc.
plot_ellipse_nucs = TRUE,  ##<< Plot (or not) an ellipse for each nuc.
change_col = TRUE, ##<< Change the color of each nucleosome.
plot_wp_nucs = TRUE,  ##<< Plot (or not) clusters of nucs.
plot_fuzzy_nucs = TRUE,  ##<< Plot (or not) clusters of fuzzy nucs.
plot_wp_nuc_model = TRUE,  ##<< Plot (or not) the gaussian model for a cluster of nucs.
plot_common_nucs = FALSE,  ##<< Plot (or not) aligned nucleosomes.
plot_common_unrs = FALSE,  ##<< Plot (or not) unaligned nucleosomal regions (UNRs).
plot_wp_nucs_4_nonmnase = FALSE,  ##<< Plot (or not) clusters for non-MNase samples.
plot_chain = FALSE,  ##<< Plot (or not) clustered nucleosomes between MNase samples.
plot_sample_id = FALSE, ##<<  Plot (or not) the sample id for each sample.
aggregated_intra_strain_nucs = NULL, ##<< list of aggregated intra strain nucs. If NULL, it will be computed.
aligned_inter_strain_nucs = NULL, ##<< list of aligned inter strain nucs. If NULL, it will be computed.
height = 10, ##<< Number of reads (per million reads) for each sample; graphical parameter for the y axis.
main=NULL, ##<< main title of the produced plot
1785
xlab=NULL, ##<< xlab of the produced plot
1786
ylab="#reads (per million reads)", ##<< ylab of the produced plot
1787
config=NULL ##<< GLOBAL config variable
1788
){
1789
  returned_list = list()
1790
  # Computing global display parameters
1791
  if (replicates[[1]][[1]]$roi[["begin"]] < replicates[[1]][[1]]$roi[["end"]]) {
1792
	  x_min_glo = replicates[[1]][[1]]$roi[["begin"]]
1793
	  x_max_glo = replicates[[1]][[1]]$roi[["end"]]
1794
  } else {
1795
	  x_min_glo = - replicates[[1]][[1]]$roi[["begin"]]
1796
	  x_max_glo = - replicates[[1]][[1]]$roi[["end"]]
1797
  }
1798
	base_glo = 0
1799
	nb_rank_glo = 0
1800
  for (samples in replicates) {
1801
  	nb_rank_glo = nb_rank_glo + length(samples)
1802
  }
1803
	ylim_glo = c(base_glo, base_glo + height * nb_rank_glo)
1804
	y_min_glo = min(ylim_glo)
1805
	y_max_glo = max(ylim_glo)
1806
  delta_y_glo = y_max_glo - y_min_glo
1807
  # Plot main frame
1808
  if (is.null(xlab)) {
1809
    xlab = paste("Ref strain:", replicates[[1]][[1]]$strain, "chr: ", replicates[[1]][[1]]$roi$chr)
1810
  }
1811
  plot(c(x_min_glo,x_max_glo), c(0,0), ylim=ylim_glo, col=0, yaxt="n", ylab=ylab, xlab=xlab, main=main )
1812
  axis(2, at=0:(nb_rank_glo*2) * delta_y_glo / (nb_rank_glo*2), labels=c(rep(c(height/2,0),nb_rank_glo),height/2))
1813
  # Go
1814
	replicates_wp_nucs = list()
1815
  wp_maps = list()
1816
  fuzzy_maps = list()
1817
  for (replicate_rank in 1:length(replicates)) {
1818
		# Computing replicate parameters
1819
		nb_rank = length(samples)
1820
		base = (replicate_rank-1) * height * nb_rank
1821
		ylim = c(base, base + height * nb_rank)
1822
		y_min = min(ylim)
1823
		y_max = max(ylim)
1824
	  delta_y = y_max - y_min
1825
		samples = replicates[[replicate_rank]]
1826
		for (sample_rank in 1:length(samples)) {
1827
			# computing sample parameters
1828
			sample = samples[[sample_rank]]
1829
			y_lev = y_min + (sample_rank - 0.5) * delta_y/nb_rank
1830
      if (plot_sample_id) {
1831
  			text(x_min_glo, y_lev + height/2 - delta_y_glo/100, labels=paste("(",sample$id,") ",sample$strain, " ", sample$marker, sep=""))
1832
      }
1833

    
1834
		  if (sample$roi[["begin"]] < sample$roi[["end"]]) {
1835
			  x_min = sample$roi[["begin"]]
1836
			  x_max = sample$roi[["end"]]
1837
		  } else {
1838
			  x_min = - sample$roi[["begin"]]
1839
			  x_max = - sample$roi[["end"]]
1840
		  }
1841
			shift = x_min_glo - x_min
1842
	    # Plot Genome seq
1843
			if (plot_ref_genome) {
1844
		  	text(1:length(sample$roi$genome) + x_min - 1 + shift, rep(y_lev - height/2, length(sample$roi$genome)), labels=toupper(sample$roi$genome), cex=dev.size()[1]*9/(x_max-x_min), family="Courier")
1845
		  }
1846
			# Plot reads
1847
			reads = sample$inputs
1848
			signs = sign_from_strand(reads[,3])
1849
			if (plot_arrow_raw_reads) {
1850
				arrows(sign(x_min) * reads[,2] + shift, sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads + y_lev, sign(x_min) * (reads[,2] + signs * read_length) + shift, sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads + y_lev,
1851
				col=1, length=0.15/nb_rank)
1852
			}
1853
	    if (plot_squared_reads) {
1854
        # require(plotrix)
1855
				rect(sign(x_min) * reads[,2] + shift, rep(y_lev,length(reads[,1])), sign(x_min) * (reads[,2] + signs * read_length) + shift,  y_lev + sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads, col=adjustcolor(1, alpha.f = 0.1),border=0)
1856
			}
1857
	    if (plot_coverage) {
1858
        if (length(reads[,1]) != 0) {
1859
          step_h = sign(x_min) * signs * reads[,4]
1860
          step_b = sign(x_min) * reads[,2] + shift
1861
          step_e = sign(x_min) * (reads[,2] + signs * 150) + shift
1862
          steps_x = min(step_b, step_e):max(step_b, step_e)
1863
          steps_y = rep(0, length(steps_x))
1864
          for (i in 1:length(step_h)) {
1865
            steps_y[which(steps_x==min(step_b[i], step_e[i]))] =  steps_y[which(steps_x==min(step_b[i], step_e[i]))] + abs(step_h[i])
1866
            steps_y[which(steps_x==max(step_b[i], step_e[i]))] =  steps_y[which(steps_x==max(step_b[i], step_e[i]))] - abs(step_h[i])
1867
          }
1868
          tmp_index = which(steps_y != 0)
1869
          steps_x = steps_x[tmp_index]
1870
          steps_y = steps_y[tmp_index]
1871
          tmp_current_level = 0
1872
          for (i in 1:length(steps_y)) {
1873
            steps_y[i] = tmp_current_level + steps_y[i]
1874
            tmp_current_level = steps_y[i]
1875
          }
1876
          steps_y = c(0, steps_y)
1877
          steps_y = steps_y * 1000000/sample$total_reads
1878
        } else {
1879
          steps_y = c(0, 0, 0)
1880
          steps_x = c(x_min, x_max)
1881
        }
1882
        # print(steps_x)
1883
        # print(steps_y)
1884
        lines(stepfun(steps_x, steps_y + y_lev), pch="")
1885
        abline(y_lev,0)
1886
        returned_list[[paste("cov", sample$id, sep="_")]] = stepfun(steps_x, steps_y)
1887
			}
1888
			# Plot nucs
1889
	    if (sample$marker == "Mnase_Seq" & (plot_squared_reads | plot_gaussian_reads | plot_gaussian_unified_reads | plot_arrow_nuc_reads)) {
1890
				nucs = sample$outputs
1891
				if (length(nucs$center) > 0) {
1892
					col = 1
1893
		      for (i in 1:length(nucs$center)) {
1894
            if (change_col) {
1895
  						col = col + 1
1896
              } else {
1897
                col = "blue"
1898
              }
1899
		        nuc = nucs[i,]
1900
						involved_reads = filter_tf_inputs(reads, sample$roi$chr, nuc$lower_bound, nuc$upper_bound, nuc_width = nuc$width)
1901
				  	involved_signs = apply(t(involved_reads[,3]), 2, function(strand) {	if (strand == "F") return(1) else return(-1)})
1902
						total_involved_reads = sum(involved_reads[,4])
1903
						if (plot_arrow_nuc_reads ) {
1904
							arrows(sign(x_min) * involved_reads[,2] + shift, sign(x_min) * involved_signs * involved_reads[,4] * 1000000/sample$total_reads + y_lev, sign(x_min) * (involved_reads[,2] + involved_signs * read_length) + shift, sign(x_min) * involved_signs * involved_reads[,4] * 1000000/sample$total_reads + y_lev,
1905
							col=col, length=0.15/nb_rank)
1906
						}
1907
	          if (plot_gaussian_reads | plot_gaussian_unified_reads) {
1908
  						flatted_reads = flat_reads(involved_reads, nuc$width)
1909
	  					delta_x = (nuc$center - nuc$width):(nuc$center + nuc$width)
1910
		  			}
1911
	          if (plot_gaussian_reads ) {
1912
							flatted_reads = flat_reads(involved_reads, nuc$width)
1913
							delta_x = (nuc$center - nuc$width):(nuc$center + nuc$width)
1914
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[1]]), sd(flatted_reads[[1]])) * length(flatted_reads[[1]]) * sign(x_min) * height/5 + y_lev, col=col)
1915
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[2]]), sd(flatted_reads[[2]])) * length(flatted_reads[[2]]) * -1 * sign(x_min) * height/5 + y_lev, col=col)
1916
	          }
1917
	          if (plot_gaussian_unified_reads ) {
1918
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[3]]), sd(flatted_reads[[3]])) * length(flatted_reads[[3]]) * height/5 + y_lev, col=col, lty=1)
1919
	          }
1920
	          if (plot_ellipse_nucs) {
1921
				      # require(plotrix)
1922
	  	 				draw.ellipse(sign(x_min) * nuc$center + shift, y_lev, nuc$width/2, total_involved_reads/nuc$width * height/5, border=col)
1923
						}
1924
		      }
1925
		    } else {
1926
		      print("WARNING! No nucs to print.")
1927
		    }
1928
			}
1929
	  }
1930

    
1931
	  # Plot wp nucs
1932
		if ((plot_wp_nucs_4_nonmnase | sample$marker == "Mnase_Seq") & (plot_wp_nucs | plot_fuzzy_nucs | plot_common_nucs | plot_chain)) {
1933
			if (samples[[1]]$marker == "Mnase_Seq") {
1934
				if (is.null(aggregated_intra_strain_nucs)) {
1935
	  			wp_nucs = aggregate_intra_strain_nucs(samples)[[1]]
1936
				} else {
1937
					wp_nucs = aggregated_intra_strain_nucs[[replicate_rank]]
1938
				}
1939
		  } else {
1940
  			wp_nucs = replicates_wp_nucs[[replicate_rank-2]]
1941
		  }
1942
      if (plot_chain) {
1943
        tf_nucs = lapply(wp_nucs, function(nuc) {
1944
          bar = apply(t(nuc$nucs), 2, function(tmp_nuc){
1945
            tmp_nuc = tmp_nuc[[1]]
1946
            tmp_nuc$inputs = NULL
1947
            tmp_nuc$original_reads = NULL
1948
            tmp_nuc$wp = nuc$wp
1949
            # print(tmp_nuc)
1950
            return(tmp_nuc)
1951
          })
1952
          return(do.call(rbind, bar))
1953
        })
1954
        tf_nucs = data.frame(do.call(rbind, tf_nucs))
1955
        tmp_x = (unlist(tf_nucs$lower_bound) + unlist(tf_nucs$upper_bound)) / 2
1956
        tmp_y =  y_min + (unlist(tf_nucs$track) - 0.5) * delta_y/nb_rank
1957
        tmp_y_prev = tmp_y[-length(tmp_y)]
1958
        tmp_y_next = tmp_y[-1]
1959
        tmp_y_inter = (tmp_y_prev + tmp_y_next) / 2
1960
        tmp_track = unlist(tf_nucs$track)
1961
        tmp_track_prev = tmp_track[-length(tmp_track)]
1962
        tmp_track_next = tmp_track[-1]
1963
        # tmp_track_inter = signif(tmp_track_prev - tmp_track_next) * (abs(tmp_track_prev - tmp_track_next) > 1) * 25
1964
        if (is.null(config$TRACK_LLR_OFFSET)) {
1965
          config$TRACK_LLR_OFFSET = 0
1966
        }
1967
        tmp_track_inter = signif(tmp_track_prev - tmp_track_next) + config$TRACK_LLR_OFFSET * 25
1968
        tmp_x_prev = tmp_x[-length(tmp_x)]
1969
        tmp_x_next = tmp_x[-1]
1970
        need_shift = apply(t(tmp_x_next - tmp_x_prev), 2, function(delta){ delta < 50})
1971
        tmp_x_inter = (tmp_x_prev + tmp_x_next) / 2 + tmp_track_inter * need_shift
1972
        tmp_llr_inter =signif(unlist(tf_nucs$llr_score)[-1], 2)
1973
        new_tmp_x = c()
1974
        new_tmp_y = c()
1975
        index_odd = 1:length(tmp_x) * 2 - 1
1976
        index_even = (1:(length(tmp_x) - 1)) * 2
1977
        new_tmp_x[index_odd] = tmp_x
1978
        new_tmp_y[index_odd] = tmp_y
1979
        new_tmp_x[index_even] = tmp_x_inter
1980
        new_tmp_y[index_even] = tmp_y_inter
1981
        lines(new_tmp_x , new_tmp_y, lwd=2)
1982
        points(tmp_x, tmp_y, cex=4, pch=16, col="white")
1983
        points(tmp_x, tmp_y, cex=4, lwd=2)
1984
        text(tmp_x, tmp_y, 1:nrow(tf_nucs))
1985
        if (is.null(config$LEGEND_LLR_POS)) {
1986
          pos = 2
1987
        } else {
1988
          pos = config$LEGEND_LLR_POS
1989
        }
1990
        col_llr = sapply(tmp_llr_inter, function(llr){if (llr < 20 ) return("green") else return("red")})
1991
        text(tmp_x_inter, tmp_y_inter, tmp_llr_inter, cex=1.5, pos=pos, col=col_llr) 
1992
        
1993
      }
1994

    
1995
      if (plot_wp_nucs | plot_fuzzy_nucs | plot_common_nucs ) {
1996
    		replicates_wp_nucs[[replicate_rank]] = wp_nucs
1997
        strain = samples[[1]]$strain
1998
        wp_maps[[strain]] = flat_aggregated_intra_strain_nucs(wp_nucs, "foo", nb_tracks)
1999
        fuzzy_maps[[strain]] = get_intra_strain_fuzzy(wp_maps[[strain]], as.list(samples[[1]]$roi), samples[[1]]$strain, config=config)
2000

    
2001
        if (plot_fuzzy_nucs) {
2002
          fuzzy_map = fuzzy_maps[[strain]]
2003
          if (!is.null(fuzzy_map)) {
2004
            if (nrow(fuzzy_map) > 0) {
2005
              rect(sign(x_min) * fuzzy_map$lower_bound + shift, y_min, sign(x_min) * fuzzy_map$upper_bound + shift, y_max, col=adjustcolor(3, alpha.f = 0.1), border=1)                    
2006
            }
2007
          }
2008
        } 
2009

    
2010
        if (plot_wp_nucs) {
2011
    			for (wp_nuc in wp_nucs) {
2012
    				if (wp_nuc$wp){
2013
    					rect(sign(x_min) * wp_nuc$lower_bound + shift, y_min, sign(x_min) * wp_nuc$upper_bound + shift, y_max, col=adjustcolor(2, alpha.f = 0.1), border=1)
2014
    					if (plot_wp_nuc_model) {
2015
      					all_original_reads = c()
2016
      					for(initial_nuc in wp_nuc$nucs) {
2017
      						all_original_reads = c(all_original_reads, initial_nuc$original_reads)
2018
      					}
2019
      					delta_x = wp_nuc$lower_bound:wp_nuc$upper_bound
2020
    					  lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(all_original_reads), sd(all_original_reads)) * length(all_original_reads) * height/5 + y_min, col=1)
2021
    				  }
2022
  				  }
2023
  				}
2024
  			}
2025
      }
2026
		}
2027
	}
2028

    
2029
	if (plot_common_nucs) {
2030
    if (is.null(aligned_inter_strain_nucs)) {
2031
      aligned_inter_strain_nucs = align_inter_strain_nucs(replicates, replicates_wp_nucs[[1]], replicates_wp_nucs[[2]], config=config)[[1]]
2032
      if (!is.null(aligned_inter_strain_nucs)) {
2033
        aligned_inter_strain_nucs$cur_index = "foo" 
2034
      }
2035
    }
2036

    
2037
    #Plot common wp nucs
2038
    mid_y = shift = x_min = x_max = nb_rank = base = ylim = y_min = y_max = delta_y = list()
2039
    for (replicate_rank in 1:length(replicates)) {
2040
      nb_rank[[replicate_rank]] = length(samples)
2041
      base[[replicate_rank]] = (replicate_rank-1) * height * nb_rank[[replicate_rank]]
2042
      ylim[[replicate_rank]] = c(base[[replicate_rank]], base[[replicate_rank]] + height * nb_rank[[replicate_rank]])
2043
      y_min[[replicate_rank]] = min(ylim[[replicate_rank]])
2044
      y_max[[replicate_rank]] = max(ylim[[replicate_rank]])
2045
      delta_y[[replicate_rank]] = y_max[[replicate_rank]] - y_min[[replicate_rank]]
2046
      mid_y[[replicate_rank]] = (y_max[[replicate_rank]] + y_min[[replicate_rank]]) / 2
2047

    
2048
      samples = replicates[[replicate_rank]]
2049
      for (sample_rank in 1:length(samples)) {
2050
        sample = samples[[sample_rank]]
2051
        y_lev = y_min[[replicate_rank]] + (sample_rank - 0.5) * delta_y[[replicate_rank]]/nb_rank[[replicate_rank]]
2052
        if (sample$roi[["begin"]] < sample$roi[["end"]]) {
2053
          x_min[[replicate_rank]] = sample$roi[["begin"]]
2054
          x_max[[replicate_rank]] = sample$roi[["end"]]
2055
        } else {
2056
          x_min[[replicate_rank]] = - sample$roi[["begin"]]
2057
          x_max[[replicate_rank]] = - sample$roi[["end"]]
2058
        }
2059
        shift[[replicate_rank]] = x_min[[1]] - x_min[[replicate_rank]]
2060
      }
2061
    }
2062
    print(aligned_inter_strain_nucs)
2063
    if (!is.null(aligned_inter_strain_nucs)) {
2064
      for (inter_strain_nuc_index in 1:length(aligned_inter_strain_nucs[,1])) {
2065
        inter_strain_nuc = aligned_inter_strain_nucs[inter_strain_nuc_index,]
2066
        tmp_xs = tmp_ys = c()
2067
        for (replicate_rank in 1:length(replicates)) {
2068
          samples = replicates[[replicate_rank]]
2069
          strain = samples[[1]]$strain
2070
          tmp_xs = c(tmp_xs, sign(x_min[[replicate_rank]]) * (inter_strain_nuc[[paste("lower_bound_",strain,sep="")]] + inter_strain_nuc[[paste("upper_bound_",strain,sep="")]])/2 + shift[[replicate_rank]])
2071
          tmp_ys = c(tmp_ys, mid_y[[replicate_rank]])
2072
        }
2073
        lines(tmp_xs, tmp_ys, col=2, type="b", lwd=dev.size()[1]*100/(x_max[[1]]-x_min[[1]])*8, cex=dev.size()[1]*400/(x_max[[1]]-x_min[[1]]), pch=19)
2074
      }
2075
    }
2076
    
2077
    if (plot_common_unrs) {
2078
      combi = c(replicates[[1]][[1]]$strain, replicates[[2]][[1]]$strain)
2079
      roi = as.list(samples[[1]]$roi)
2080
      cur_index = "foo"
2081
      common_nuc_results = list()
2082
      common_nuc_results[[paste(combi[1], combi[2], sep="_")]] = aligned_inter_strain_nucs
2083
      unrs = get_unrs(combi, roi, cur_index, wp_maps, fuzzy_maps, common_nuc_results, config = config) 
2084
      rect(sign(x_min[[1]]) * unrs$lower_bound + shift[[1]], y_min[[1]], sign(x_min[[1]]) * unrs$upper_bound + shift[[1]], y_max[[2]], border=4, lwd=10, col=adjustcolor(4, alpha.f = 0.05))        
2085
    }
2086

    
2087
	}
2088
  return(returned_list)
2089
}
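
# Hedged usage sketch: retrieving the coverage step function computed when
# plot_coverage=TRUE. It assumes `replicates` and `config` were built as in the
# build_replicates example (sample id 1), so the coverage is stored under "cov_1".
# out = watch_samples(replicates, config$READ_LENGTH, plot_coverage=TRUE, plot_squared_reads=FALSE)
# cov_fun = out[["cov_1"]]   # stepfun over genomic coordinates
# cov_fun(473000)            # coverage, in reads per million reads, at position 473000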
get_intra_strain_fuzzy = function(# Compute the fuzzy list for a given strain.
2094
### This function grabs the nucleosomes detected by template_filter that have been rejected by aggregate_intra_strain_nucs as well-positioned.
wp_map, ##<< Well-positioned nucleosomes map.
roi, ##<< The region of interest.
strain, ##<< The strain for which we want to extract the fuzzy map.
2098
config=NULL ##<< GLOBAL config variable.
2099
) {
2100
  fuzzy_map = wp_map[wp_map$wp==0, ]
2101
  if (nrow(fuzzy_map) > 0) {
2102
    fuzzy_map = substract_region(fuzzy_map, wp_map[wp_map$wp==1,])
2103
    if (!is.null(fuzzy_map)) {
2104
      fuzzy_map = union_regions(fuzzy_map)
2105
      fuzzy_map = crop_fuzzy(fuzzy_map, roi, strain, config)
2106
    }
2107
  }
2108
  return(fuzzy_map)
2109
}
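
# Hedged usage sketch (assumptions: `wp_maps`, `roi` and `config` built as in
# watch_samples, with a wp map available for strain "BY").
# fuzzy_map = get_intra_strain_fuzzy(wp_maps[["BY"]], roi, "BY", config=config)
# fuzzy_map[, c("lower_bound", "upper_bound")]  # fuzzy regions left after removing wp nucs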
get_unrs = function(# Compute the unaligned nucleosomal regions (UNRs).
2116
### This function aggregates the non-common wp nucs of each strain and subtracts the common wp nucs. It does not take the size of the resulting UNR into account; this is handled in the read-count part of the pipeline.
combi, ##<< The strain combination to consider.
roi, ##<< The region of interest.
cur_index, ##<< The region of interest index.
wp_maps, ##<< Well-positioned nucleosomes maps.
2121
fuzzy_maps, ##<< Fuzzy nucleosomes maps.
2122
common_nuc_results, ##<< Common wp nuc maps
2123
config=NULL ##<< GLOBAL config variable
2124
) {
2125
  # print(cur_index)
2126

    
2127
  tmp_combi_key = paste(combi[1], combi[2], sep="_")
2128
  tmp_common_nucs = common_nuc_results[[tmp_combi_key]]
2129
  tmp_common_nucs = tmp_common_nucs[tmp_common_nucs$cur_index == cur_index, ]
2130

    
2131
  # print(paste("Dealing with unr from", combi[1]))
2132
  tmp_fuzzy = fuzzy_maps[[combi[1]]]
2133
  tmp_fuzzy = tmp_fuzzy[tmp_fuzzy$cur_index == cur_index, ]
2134
  tmp_wp = wp_maps[[combi[1]]]
2135
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2136
  tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ]
2137
  # Let's go!
2138
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2139
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[1], sep="_")]]) {
2140
      return(NULL)
2141
    } else {
2142
      return (index_nuc)
2143
    }
2144
  }))
2145
  tmp_unaligned_wp = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2146
  tmp_unr = rbind(tmp_fuzzy,tmp_unaligned_wp[,1:4])
2147
  if (length(tmp_unr) != 0) {
2148
    tmp_unr = union_regions(tmp_unr)
2149
  }
2150
  tmp_unr_nucs_1 = tmp_unr
2151
  if (length(tmp_unr_nucs_1[,1]) == 0) {return(NULL)}
2152
  agg_unr_1 = tmp_unr_nucs_1
2153

    
2154
  # print(paste("Dealing with unr from ", combi[2]))
2155
  tmp_fuzzy = fuzzy_maps[[combi[2]]]
2156
  tmp_fuzzy = tmp_fuzzy[tmp_fuzzy$cur_index == cur_index, ]
2157
  tmp_wp = wp_maps[[combi[2]]]
2158
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2159
  tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ]
2160
  # Let's go!
2161
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2162
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[2], sep="_")]]) {
2163
      return(NULL)
2164
    } else {
2165
      return (index_nuc)
2166
    }
2167
  }))
2168
  tmp_unaligned_wp = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2169
  tmp_unr = rbind(tmp_fuzzy,tmp_unaligned_wp[,1:4])
2170
  if (length(tmp_unr) != 0) {
2171
    tmp_unr = union_regions(tmp_unr)
2172
  }
2173
  tmp_unr_nucs_2 = tmp_unr
2174
  if (length(tmp_unr_nucs_2[,1]) == 0) {return(NULL)}
2175
  agg_unr_2 = crop_fuzzy(tmp_unr_nucs_2, roi, combi[2], config)
2176
  tr_agg_unr_2 = translate_regions(agg_unr_2, combi, cur_index, roi=roi, config=config)
2177
  tr_agg_unr_2 = union_regions(tr_agg_unr_2)
2178

    
2179
  # print("Dealing with unr from both...")
2180
  all_unr = union_regions(rbind(agg_unr_1, tr_agg_unr_2))
2181

    
2182
  # print(paste("Dealing with wp from", combi[1]))
2183
  tmp_wp = wp_maps[[combi[1]]]
2184
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2185
  tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ]
2186
  # Let's go!
2187
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2188
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[1], sep="_")]]) {
2189
      return (index_nuc)
2190
    } else {
2191
      return(NULL)
2192
    }
2193
  }))
2194
  wp_nucs_1 = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2195
  
2196
  # print(paste("Dealing with wp from", combi[2]))
2197
  tmp_wp = wp_maps[[combi[2]]]
2198
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2199
  tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ]
2200
  # Let's go!
2201
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2202
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[2], sep="_")]]) {
2203
      return (index_nuc)
2204
    } else {
2205
      return(NULL)
2206
    }
2207
  }))
2208
  wp_nucs_2 = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2209
  wp_nucs_2 = crop_fuzzy(wp_nucs_2, roi, combi[2], config)
2210
  if (nrow(wp_nucs_2) == 0) {
2211
    tr_wp_nucs_2 = wp_nucs_2
2212
  } else {
2213
    tr_wp_nucs_2 = translate_regions(wp_nucs_2, combi, cur_index, roi=roi, config=config)      
2214
  }
2215

    
2216
  # print("Dealing with wp from both...")
2217
  all_wp = union_regions(rbind(wp_nucs_1[,1:4], tr_wp_nucs_2))
2218

    
2219
  # print("Dealing with unr and wp...")
2220
  non_inter_unr = substract_region(all_unr, all_wp)
2221
  non_inter_unr = crop_fuzzy(non_inter_unr, roi, combi[1], config)
2222
  if (is.null(non_inter_unr)) { return(NULL) }
2223
  non_inter_unr$len = non_inter_unr$upper_bound - non_inter_unr$lower_bound
2224
  min_unr_width = 75
2225
  non_inter_unr = non_inter_unr[non_inter_unr$len >= min_unr_width,]
2226

    
2227
  non_inter_unr$index_nuc = 1:length(non_inter_unr[,1])
2228
  return (non_inter_unr)
2229
}
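
# Hedged usage sketch, mirroring the call made in watch_samples (wp_maps, fuzzy_maps,
# common_nuc_results and roi built there, with cur_index set to "foo").
# unrs = get_unrs(c("BY", "RM"), roi, "foo", wp_maps, fuzzy_maps, common_nuc_results, config=config)
# unrs[, c("lower_bound", "upper_bound", "len")]  # UNRs at least 75 bp wide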