Statistiques
| Branche: | Révision :

root / src / R / nucleominer.R @ f0703205

Historique | Voir | Annoter | Télécharger (86,37 ko)

1
# get_content = function(# Get content from cached
2
# ### Acces to the cached content of a file via the global variable NM_CACHE. Load content in NM_CACHE if needed.
3
# obj, ##<< The object that we want its content.
4
# ... ##<< Parameters that will be passed to my_read
5
# ) {
6
#   UseMethod("get_content", obj)
7
#   ### Returns the cached content of the current object.
8
# }
9
#
10
# get_content.default = structure( function(# Get content from cached
11
# ### Acces to the cached content of a file via the global variable NM_CACHE. Load content in NM_CACHE if needed.
12
# obj, ##<< The object that we want its content.
13
# ... ##<< Parameters that will be passed to my_read
14
# ) {
15
#   if (inherits(try(NM_CACHE,TRUE), "try-error") || is.null(NM_CACHE)) {
16
#     NM_CACHE <<- list()
17
#   }
18
#   if (is.null(NM_CACHE[[obj$filename]])) {
19
#     print(paste("Loading file ",obj$filename, sep=""))
20
#     tmp_content = my_read(obj, ...)
21
#     print("affect it...")
22
#     NM_CACHE[[obj$filename]] <<- tmp_content
23
#     print("done.")
24
#   }
25
#   return(NM_CACHE[[obj$filename]])
26
# ### Returns the cached content of the current object.
27
# }, ex=function(){
28
#   # Create a dataframe
29
#   df = NULL
30
#   df = dfadd(df, list(key1 = "value1", key2 = "value2"))
31
#   df = dfadd(df, list(key1 = "value1'", key2 = "value3'"))
32
#   # Dump it into tmp file
33
#   write.table(df, file="/tmp/tmp.dump.table.tmp")
34
#   # Load it into cache using feature.
35
#   cached_content_object  = list(filename="/tmp/tmp.dump.table.tmp")
36
#   class(cached_content_object) = "table"
37
#   # First time it will be load into cache
38
#   print(get_content(cached_content_object))
39
#   # Second time not
40
#   print(get_content(cached_content_object))
41
#
42
# })
43
#
44
# my_read = function(
45
# ### Abstract my_read function.
46
#   obj, ...) {
47
#   UseMethod("my_read", obj)
48
# }
49
#
50
# my_read.default = function(
51
# ### Default my_read function.
52
#   obj, ...){
53
#   stop(paste("ERROR, my_read is not defined for any Objects (file ", obj$filename," )", sep=""))
54
# }
55
#
56
# my_read.fasta = function(
57
# ### my_read function for fasta files.
58
#   obj, ...){
59
#   # require(seqinr)
60
#   return(read.fasta(obj$filename, ...))
61
# }
62
#
63
# my_read.table = function(
64
# ### my_read function for table files.
65
#   obj, ...){
66
#   if (rev(unlist(strsplit(obj$filename, ".", fixed=TRUE)))[1] == "gz") {
67
#     return(read.table(file=gzfile(obj$filename), ...))
68
#   } else {
69
#     return(read.table(file=obj$filename, ...))
70
#   }
71
# }
72
#
73
# my_read.cvs = function(
74
# ### my_read function for cvs files.
75
#   obj, ...){
76
#   return(read.csv(file=obj$filename, ...))
77
# }
78

    
79

    
80

    
81

    
82

    
83

    
84

    
85

    
86

    
87
FDR = structure(function#  False Discovery Rate
88
### From a vector x of independent p-values, extract the cutoff corresponding to the specified FDR. See Benjamini & Hochberg 1995 paper
89
##author<< Gael Yvert,
90
(
91
x, ##<< A vector x of independent p-values.
92
FDR ##<< The specified FDR.
93
) {
94
  x <- sort(na.omit(x))
95
  N = length(x)
96
  i = 1;
97
  while(N*x[i]/i < FDR & i <= N) i = i + 1; # we search for the highest i where Nrandom / Nobserved < FDR
98
  if (i == 1)
99
    return (NA)
100
  else
101
    return( x[i-1] )
102
### Return the the corresponding cutoff.
103
}, ex=function(){
104
  print("example")
105
})
106

    
107

    
108

    
109
lod_score_vecs = structure(function # Likelihood ratio
110
### Compute the likelihood log of two set of value from two models Vs. a unique model.
111
(
112
x ,##<< First vector.
113
y ##<< Second vector.
114
) {
115
	if (length(x) <=1 | length(y) <= 1) {
116
		return(NA)
117
	}
118
  meanX = mean(x)
119
  sdX = sd(x)
120
  meanY = mean(y)
121
  sdY = sd(y)
122
  meanXY = mean(c(x,y))
123
  sdXY = sd(c(x,y))
124
  llX = sum(log(dnorm(x,mean=meanX,sd=sdX)))
125
  llY = sum(log(dnorm(y,mean=meanY,sd=sdY)))
126
  llXY = sum(log(dnorm(c(x,y),mean=meanXY,sd=sdXY)))
127
  ratio = llX + llY - llXY
128
  return(ratio)
129
### Returns the likelihood ratio.
130
}, ex=function(){
131
  # LOD score for 2 set of values
132
  mean1=5; sd1=2; card2 = 250
133
  mean2=6; sd2=3; card1 = 200
134
  x1 = rnorm(card1, mean1, sd1)
135
  x2 = rnorm(card2, mean2, sd2)
136
  min = floor(min(c(x1,x2)))
137
  max = ceiling(max(c(x1,x2)))
138
  hist(c(x1,x2), xlim=c(min, max), breaks=min:max)
139
  lines(min:max,dnorm(min:max,mean1,sd1)*card1,col=2)
140
  lines(min:max,dnorm(min:max,mean2,sd2)*card2,col=3)
141
  lines(min:max,dnorm(min:max,mean(c(x1,x2)),sd(c(x1,x2)))*card2,col=4)
142
  lod_score_vecs(x1,x2)
143
 })
144

    
145
dfadd = structure(function# Adding list to a dataframe.
146
### Add a list \emph{l} to a dataframe \emph{df}. Create it if \emph{df} is \emph{NULL}. Return the dataframe \emph{df}.
147
	(df, ##<<  A dataframe
148
		l ##<<  A list
149
	) {
150
  if (is.null(df)) {
151
    df = data.frame(l,stringsAsFactors=FALSE)
152
  } else {
153
    df = rbind(df, data.frame(l,stringsAsFactors=FALSE))
154
  }
155
  return(df)
156
### Return the dataframe \emph{df}.
157
}, ex=function(){
158
		## Here dataframe is NULL
159
		print(df)
160
		df = NULL
161

    
162
		# Initialize df
163
		df = dfadd(df, list(key1 = "value1", key2 = "value2"))
164
		print(df)
165

    
166
		# Adding elements to df
167
		df = dfadd(df, list(key1 = "value1'", key2 = "value2'"))
168
		print(df)
169
})
170

    
171

    
172
sign_from_strand = function(
173
### Get the sign of strand
174
strands) {
175
	apply(t(strands), 2, function(strand) {	if (strand == "F") return(1) else return(-1)})
176
### If strand in forward then returns 1 else returns -1
177
}
178

    
179
flat_reads = function(
180
### Extract reads coordinates from TempleteFilter input sequence
181
reads, ##<< TemplateFilter input reads
182
nuc_width ##<< Width used to shift F and R reads.
183
) {
184
	F_flatted_reads = unlist(apply(t(reads[reads$V3=="F",]),2,function(r){rep(as.integer(r[2]), r[4])}))
185
	R_flatted_reads = unlist(apply(t(reads[reads$V3=="R",]),2,function(r){rep(as.integer(r[2]), r[4])}))
186
	flatted_reads = c(F_flatted_reads + rep(nuc_width/2, length(F_flatted_reads)), R_flatted_reads - rep(nuc_width/2, length(R_flatted_reads))  )
187
	return(list(F_flatted_reads, R_flatted_reads, flatted_reads))
188
### Returns a list of F reads, R reads and joint/shifted F and R reads.
189
}
190

    
191

    
192

    
193
filter_tf_outputs = function(# Filter TemplateFilter outputs
194
### This function filters TemplateFilter outputs according, not only genome area observerved properties, but also correlation and overlap threshold.
195
tf_outputs, ##<< TemplateFilter outputs.
196
chr, ##<< Chromosome observed, here chr is an integer.
197
x_min, ##<< Coordinate of the first bp observed.
198
x_max, ##<< Coordinate of the last bp observed.
199
nuc_width = 160, ##<< Nucleosome width.
200
ol_bp = 59, ##<< Overlap Threshold.
201
corr_thres = 0.5 ##<< Correlation threshold.
202
) {
203
  if (x_min < 0) {
204
    tf_outputs = tf_outputs[tf_outputs$chr == paste("chr", chr, sep="") & tf_outputs$center > (-x_max - nuc_width/2) & tf_outputs$center <  (-x_min + nuc_width/2),]
205
	} else {
206
    tf_outputs = tf_outputs[tf_outputs$chr == paste("chr", chr, sep="") & tf_outputs$center > (x_min - nuc_width/2) & tf_outputs$center < (x_max + nuc_width/2),]
207
  }
208
  tf_outputs$lower_bound = tf_outputs$center - tf_outputs$width/2
209
  tf_outputs$upper_bound = tf_outputs$center + tf_outputs$width/2
210
  tf_outputs = tf_outputs[tf_outputs$correlation >= corr_thres,]
211
  tf_outputs = tf_outputs[order(tf_outputs$correlation,decreasing=TRUE),]
212
  i = 1
213
  while (i <= length(tf_outputs[,1])) {
214
    lb = tf_outputs[i,]$low
215
    ub = tf_outputs[i,]$up
216
    tf_outputs = tf_outputs[!(tf_outputs$low <= (ub-ol_bp) & tf_outputs$up > ub) & !(tf_outputs$up >= (lb+ol_bp) & tf_outputs$low < lb),]
217
    i = i+1
218
  }
219
  return(tf_outputs)
220
### Returns filtered TemplateFilter Outputs
221
}
222

    
223

    
224

    
225
filter_tf_inputs = function(# Filter TemplateFilter inputs
226
### This function filters TemplateFilter inputs according genome area observed properties. It takes into account reads that are at the frontier of this area and the strand of these reads.
227
inputs, ##<< TF inputs to be filtered.
228
chr, ##<< Chromosome observed, here chr is an integer.
229
x_min, ##<< Coordinate of the first bp observed.
230
x_max, ##<< Coordinate of the last bp observed.
231
nuc_width = 160, ##<< Nucleosome width.
232
only_f = FALSE, ##<< Filter only F reads.
233
only_r = FALSE, ##<< Filter only R reads.
234
filter_for_coverage = FALSE ##<< Does it filter for plot coverage?
235
) {
236
	if (only_f) {
237
		inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "F" & inputs[,2] <= x_max + nuc_width,]
238
	} else if (only_r) {
239
		inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "R" & inputs[,2] <= x_max + nuc_width,]
240
	} else {
241
		inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,2] <= x_max + nuc_width,]
242
	}
243
  if (!filter_for_coverage) {
244
    corrected_inputs_coords = inputs[,2] + nuc_width/2 * sign_from_strand(inputs[,3])
245
    inputs = inputs[inputs[,1]==chr & corrected_inputs_coords >= x_min & corrected_inputs_coords <= x_max,]
246
  }
247
	return(inputs)
248
### Returns filtred inputs.
249
}
250

    
251

    
252

    
253
get_comp_strand = function(
254
### Compute the complementatry strand.
255
strand ##<< The original strand.
256
) {
257
	apply(t(strand),2, function(n){
258
	  if (n=="a") {return("t")}
259
		if (n=="t") {return("a")}
260
		if (n=="c") {return("g")}
261
		if (n=="g") {return("c")}
262
	})
263
### Returns the complementatry strand.
264
}
265

    
266

    
267
aggregate_intra_strain_nucs = structure(function(# Aggregate replicated sample's nucleosomes.
268
### This function aggregates nucleosome for replicated samples. It uses TemplateFilter ouput of each sample as replicate. Each sample owns a set of nucleosomes computed using TemplateFilter and ordered by the position of their center. Adajacent nucleosomes are compared two by two. Comparison is based on a log likelihood ratio score. The issue of comparison is adjacents nucleosomes merge or separation. Finally the function returns a list of clusters and all computed \emph{lod_scores}. Each cluster ows an attribute \emph{wp} for "well positionned". This attribute is set as \emph{TRUE} if the cluster is composed of exactly one nucleosomes of each sample.
269
samples, ##<< A list of samples. Each sample is a list like \emph{sample = list(id=..., marker=..., strain=..., roi=..., inputs=..., outputs=...)} with \emph{roi = list(name=..., begin=...,  end=..., chr=..., genome=...)}.
270
lod_thres=20, ##<< Log likelihood ration threshold.
271
coord_max=20000000 ##<< A too big value to be a coord for a nucleosome lower bound.
272
){
273
	end_of_tracks = function(tracks) {
274
		if (length(tracks) == 0) {
275
			return(TRUE)
276
		}
277
	  for (lower_bound in tracks) {
278
			if(!is.na(lower_bound)) {
279
	      if (lower_bound < coord_max) {
280
	        return(FALSE)
281
	      }
282
	  	}
283
	  }
284
	  return(TRUE)
285
	}
286
	store_cluster = function(clusters, new_cluster, nb_nucs_in_cluster,nuc_from_track,nb_tracks, min_nuc_center, max_nuc_center) {
287
		if ( nb_nucs_in_cluster==nb_tracks & sum(nuc_from_track)==nb_tracks) {
288
			new_cluster$wp = TRUE
289
			center = (new_cluster$lower_bound + new_cluster$upper_bound) / 2
290
			if (is.null(min_nuc_center) | ((min_nuc_center <= center) & (center < max_nuc_center))) {
291
		  	clusters[[length(clusters) + 1]] = new_cluster
292
				# print(new_cluster)
293
		  }
294
		} else {
295
			new_cluster$wp = FALSE
296
			center = (new_cluster$lower_bound + new_cluster$upper_bound) / 2
297
			if (is.null(min_nuc_center) | ((min_nuc_center <= center) & (center < max_nuc_center))) {
298
			  clusters[[length(clusters) + 1]] = new_cluster
299
			}
300
		}
301
		return(clusters)
302
	}
303
	strain = samples[[1]]$strain
304
	lod_scores = c()
305
  min_nuc_center = min(samples[[1]]$roi$begin, samples[[1]]$roi$end)
306
	max_nuc_center = max(samples[[1]]$roi$begin, samples[[1]]$roi$end)
307
  # compute clusters
308
  clusters = list()
309
  cluster_contents = list()
310
  # Init reader
311
  indexes = c()
312
  track_readers = c()
313
  current_nuc = NULL
314
	lod_score = lod_thres + 1
315
  # Read nucs from TF outputs
316
  tf_outs = list()
317
	i = 1
318
  for (sample in samples) {
319
		# print(sample$roi$chr)
320
		# print(min_nuc_center)
321
		# print(max_nuc_center)
322
		# print(sample$outputs)
323
		# tf_outs[[i]] = filter_tf_outputs(sample$outputs, sample$roi$chr, min_nuc_center, max_nuc_center)
324
		# print(tf_outs[[i]])
325
		tf_outs[[i]] = sample$outputs
326
		tf_outs[[i]] = tf_outs[[i]][order(tf_outs[[i]]$center),]
327
    indexes[i] = 1
328
		if (is.na(tf_outs[[i]][indexes[i],]$center)) {
329
      track_readers[i] = coord_max
330
	  } else {
331
      track_readers[i] = tf_outs[[i]][indexes[i],]$center
332
		}
333
		i = i+1
334
  }
335
	# print(track_readers)
336
  new_cluster = NULL
337
  nb_nucs_in_cluster = 0
338
  nuc_from_track = c()
339
  for (i in 1:length(tf_outs)){
340
    nuc_from_track[i] = FALSE
341
  }
342
  # Start clustering
343
  while (!end_of_tracks(track_readers)) {
344
    new_center = min(track_readers)
345
		current_track = which(track_readers == new_center)[1]
346
    new_nuc = as.list(tf_outs[[current_track]][indexes[current_track],])
347
		new_nuc$chr = substr(new_nuc$chr,4,1000000L)
348
		new_nuc$inputs = samples[[current_track]]$inputs
349
		new_nuc$chr = samples[[current_track]]$roi$chr
350
		new_nuc$track = current_track
351

    
352
		new_nuc$inputs = filter_tf_inputs(samples[[current_track]]$inputs, new_nuc$chr, new_nuc$lower_bound, new_nuc$upper_bound, new_nuc$width)
353
		flatted_reads = flat_reads(new_nuc$inputs, new_nuc$width)
354
		new_nuc$original_reads = flatted_reads[[3]]
355

    
356
    new_upper_bound = new_nuc$upper_bound
357

    
358
    if (!is.null(current_nuc)) {
359
			lod_score = lod_score_vecs(current_nuc$original_reads,new_nuc$original_reads)
360
			lod_scores = c(lod_scores,lod_score)
361
		}
362
		# print(paste(lod_score, length(current_nuc$original_reads), length(new_nuc$original_reads), sep=" "))
363
		if (is.na(lod_score)) {
364
			lod_score = lod_thres + 1
365
		}
366
		# Store lod_score
367
		new_nuc$lod_score = lod_score
368
	  if (lod_score < lod_thres) {
369
      # aggregate to current cluster
370
      #   update bound
371
      if (new_nuc$upper_bound > new_cluster$upper_bound) {
372
        new_cluster$upper_bound = new_nuc$upper_bound
373
      }
374
      if (new_nuc$lower_bound < new_cluster$lower_bound) {
375
        new_cluster$lower_bound = new_nuc$lower_bound
376
      }
377
      #   add nucleosome to current cluster
378
      nuc_from_track[current_track] = TRUE
379
      nb_nucs_in_cluster = nb_nucs_in_cluster + 1
380
			new_cluster$nucs[[length(new_cluster$nucs)+1]] = new_nuc
381
    } else {
382
			if (!is.null(new_cluster)) {
383
        # store old cluster
384
	      clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster,nuc_from_track,length(tf_outs),min_nuc_center, max_nuc_center)
385
			}
386
      # Reinit current cluster composition stuff
387
      nb_nucs_in_cluster = 0
388
      nuc_from_track = c()
389
      for (i in 1:length(tf_outs)){
390
        nuc_from_track[i] = FALSE
391
      }
392
      # create new cluster
393
      new_cluster = list(lower_bound=new_nuc$low, upper_bound=new_nuc$up, chr=new_nuc$chr, strain_ref=strain , nucs=list())
394
      # update upper bound
395
      current_upper_bound = new_upper_bound
396
      # add nucleosome to current cluster
397
      nb_nucs_in_cluster = nb_nucs_in_cluster + 1
398
      nuc_from_track[current_track] = TRUE
399
			new_cluster$nucs[[length(new_cluster$nucs)+1]] = new_nuc
400

    
401
		}
402

    
403
		current_nuc = new_nuc
404

    
405
    # update indexes
406
    if (indexes[current_track] < length(tf_outs[[current_track]]$center)) {
407
      indexes[current_track] = indexes[current_track] + 1
408
      # update track
409
      track_readers[current_track] = tf_outs[[current_track]][indexes[current_track],]$center
410
    } else {
411
      # update track
412
      track_readers[current_track] = coord_max
413
    }
414
  }
415
  # store last cluster
416
  if (!is.null(new_cluster)) {
417
    # store old cluster
418
    clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster,nuc_from_track,length(tf_outs),min_nuc_center, max_nuc_center)
419
  }
420
	return(list(clusters, lod_scores))
421
### Returns a list of clusterized nucleosomes, and all computed lod scores.
422
}, ex=function(){
423
	# Dealing with a region of interest
424
	roi =list(name="example", begin=1000,  end=1300, chr="1", genome=rep("A",301))
425
	samples = list()
426
	for (i in 1:3) {
427
		# Create TF output
428
		tf_nuc = list("chr"=paste("chr", roi$chr, sep=""), "center"=(roi$end + roi$begin)/2, "width"= 150, "correlation.score"= 0.9)
429
		outputs = dfadd(NULL,tf_nuc)
430
		outputs = filter_tf_outputs(outputs, roi$chr, roi$begin, roi$end)
431
		# Generate corresponding reads
432
		nb_reads = round(runif(1,170,230))
433
		reads = round(rnorm(nb_reads, tf_nuc$center,20))
434
		u_reads = sort(unique(reads))
435
		strands = sample(c(rep("R",ceiling(length(u_reads)/2)),rep("F",floor(length(u_reads)/2))))
436
		counts = apply(t(u_reads), 2, function(r) { sum(reads == r)})
437
		shifts = apply(t(strands), 2, function(s) { if (s == "F") return(-tf_nuc$width/2) else return(tf_nuc$width/2)})
438
		u_reads = u_reads + shifts
439
		inputs = data.frame(list("V1" = rep(roi$chr, length(u_reads)),
440
		                         "V2" = u_reads,
441
														 "V3" = strands,
442
														 "V4" = counts), stringsAsFactors=FALSE)
443
		samples[[length(samples) + 1]] = list(id=1, marker="Mnase_Seq", strain="strain_ex", total_reads = 10000000, roi=roi, inputs=inputs, outputs=outputs)
444
	}
445
	print(aggregate_intra_strain_nucs(samples))
446
})
447

    
448
flat_aggregated_intra_strain_nucs = function(# to flat aggregate_intra_strain_nucs function output
449
### This function builds a dataframe of all clusters obtain from aggregate_intra_strain_nucs function.
450
partial_strain_maps, ##<< the output of aggregate_intra_strain_nucs function
451
roi_index ##<< the index of the roi involved
452
) {
453
	if  (length(partial_strain_maps) == 0 ){
454
		print(paste("Empty partial_strain_maps for roi", roi_index, "ands current strain." ))
455
    tmp_strain_maps = list()
456
	} else {
457
		tmp_strain_map = apply(t(1:length(partial_strain_maps)), 2, function(i){
458
			tmp_nuc = partial_strain_maps[[i]]
459
			tmp_nuc_as_list = list()
460
			tmp_nuc_as_list[["chr"]] = tmp_nuc[["chr"]]
461
			tmp_nuc_as_list[["lower_bound"]] = ceiling(tmp_nuc[["lower_bound"]])
462
			tmp_nuc_as_list[["upper_bound"]] = floor(tmp_nuc[["upper_bound"]])
463
			tmp_nuc_as_list[["roi_index"]] = roi_index
464
			tmp_nuc_as_list[["index_nuc"]] = i
465
			tmp_nuc_as_list[["wp"]] = as.integer(tmp_nuc$wp)
466
			all_original_reads = c()
467
			for (j in 1:length(tmp_nuc$nucs)) {
468
				all_original_reads = c(all_original_reads, tmp_nuc$nucs[[j]]$original_reads)
469
			}
470
			tmp_nuc_as_list[["nb_reads"]] = length(all_original_reads)
471
			if (tmp_nuc$wp) {
472
				tmp_nuc_as_list[["lod_1"]] = signif(tmp_nuc$nucs[[2]]$lod_score,5)
473
				tmp_nuc_as_list[["lod_2"]] = signif(tmp_nuc$nucs[[3]]$lod_score,5)
474
			} else {
475
				tmp_nuc_as_list[["lod_1"]] = NA
476
				tmp_nuc_as_list[["lod_2"]] = NA
477
			}
478
      return(tmp_nuc_as_list)
479
    })
480
    tmp_strain_maps = do.call("rbind", tmp_strain_map)
481
	}
482
  return(data.frame(lapply(data.frame(tmp_strain_maps, stringsAsFactors=FALSE), unlist), stringsAsFactors=FALSE))
483
### Returns a dataframe of all clusters obtain from aggregate_intra_strain_nucs function.
484
}
485

    
486
align_inter_strain_nucs = structure(function(# Aligns nucleosomes between 2 strains.
487
### This function aligns nucs between two strains for a given genome region.
488
replicates, ##<< Set of replicates, ideally 3 per strain.
489
wp_nucs_strain_ref1=NULL, ##<< List of aggregates nucleosome for strain 1. If it's null this list will be computed.
490
wp_nucs_strain_ref2=NULL, ##<< List of aggregates nucleosome for strain 2. If it's null this list will be computed.
491
corr_thres=0.5, ##<< Correlation threshold.
492
lod_thres=100, ##<< LOD cut off.
493
config=NULL, ##<< GLOBAL config variable
494
... ##<< A list of parameters that will be passed to \emph{aggregate_intra_strain_nucs} if needed.
495
) {
496
	if (length(replicates) < 2) {
497
		stop("ERROR, align_inter_strain_nucs needs 2 replicate sets.")
498
	} else if (length(replicates) > 2) {
499
		print("WARNING, align_inter_strain_nucs will use 2 first sets of replicates as inputs.")
500
	}
501
	common_nuc = NULL
502
	lod_scores = c()
503
	chr = replicates[[1]][[1]]$roi$chr
504
  min_nuc_center = min(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end)
505
	max_nuc_center = max(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end)
506
	strain_ref1 = replicates[[1]][[1]]$strain
507
	strain_ref2 = replicates[[2]][[1]]$strain
508
	big_roi = replicates[[1]][[1]]$roi
509
  orig_big_roi = replicates[[1]][[1]]$orig_roi
510
	if(big_roi$end - big_roi$begin < 0) {
511
		tmp_begin = big_roi$begin
512
		big_roi$begin =  big_roi$end
513
		big_roi$end =  tmp_begin
514
	}
515
	# GO!
516
	if (is.null(wp_nucs_strain_ref1)) {
517
		wp_nucs_strain_ref1 = aggregate_intra_strain_nucs(replicates[[1]], ...)[[1]]
518
	}
519
	if (is.null(wp_nucs_strain_ref2)) {
520
	  wp_nucs_strain_ref2 = aggregate_intra_strain_nucs(replicates[[2]], ...)[[1]]
521
  }
522
  # foo <<- wp_nucs_strain_ref1
523
  # print(apply(t(wp_nucs_strain_ref1), 2, function(l){c(l[[1]]$lower_bound, l[[1]]$upper_bound, l[[1]]$wp)}))
524
  # print(apply(t(wp_nucs_strain_ref2), 2, function(l){c(l[[1]]$lower_bound, l[[1]]$upper_bound, l[[1]]$wp)}))
525
	# dealing with matching_nas
526
	lws = c()
527
	ups = c()
528
	for (na in wp_nucs_strain_ref2) {
529
		lws = c(lws, na$lower_bound)
530
		ups = c(ups, na$upper_bound)
531
	}
532

    
533
	print(paste("Exploring chr" , chr , ", " , length(wp_nucs_strain_ref1) , ", [" , min_nuc_center , ", " , max_nuc_center , "] nucs...", sep=""))
534
	roi_strain_ref1 = NULL
535
	roi_strain_ref2 = NULL
536
	if (length(wp_nucs_strain_ref1) > 0) {
537
		for(index_nuc_strain_ref1 in 1:length(wp_nucs_strain_ref1)){
538
			# print(paste("" , index_nuc_strain_ref1 , "/" , length(wp_nucs_strain_ref1), sep=""))
539
			nuc_strain_ref1 = wp_nucs_strain_ref1[[index_nuc_strain_ref1]]
540
			# Filtering on Well Positionned
541
			if (nuc_strain_ref1$wp) {
542
				roi_strain_ref1 = list(name=paste("strain_chr_id_" , strain_ref1 , "_" , chr , "_" , "i" , "_", sep=""), begin=nuc_strain_ref1$lower_bound, end=nuc_strain_ref1$upper_bound, chr=chr, strain_ref = strain_ref1)
543
				roi_strain_ref2 = translate_roi(roi_strain_ref1, strain_ref2, big_roi=orig_big_roi, config=config)
544
        if (!is.null(roi_strain_ref2)){
545
					# LOADING INTRA_STRAIN_NUCS_FILENAME_STRAIN_REF2 FILE(S) TO COMPUTE MATCHING_NAS (FILTER)
546
					lower_bound_roi_strain_ref2 = min(roi_strain_ref2$end,roi_strain_ref2$begin)
547
					upper_bound_roi_strain_ref2 = max(roi_strain_ref2$end,roi_strain_ref2$begin)
548
					matching_nas = which( lower_bound_roi_strain_ref2 <= ups & lws <= upper_bound_roi_strain_ref2)
549
					for (index_nuc_strain_ref2 in matching_nas) {
550
						nuc_strain_ref2 = wp_nucs_strain_ref2[[index_nuc_strain_ref2]]
551
						# Filtering on Well Positionned
552
    				nuc_strain_ref2_to_roi = list(begin=nuc_strain_ref2$lower_bound, end=nuc_strain_ref2$upper_bound, chr=nuc_strain_ref2$chr, strain_ref = strain_ref2)
553
						if (!is.null(translate_roi(nuc_strain_ref2_to_roi, strain_ref1, big_roi=orig_big_roi, config=config)) &
554
                nuc_strain_ref2$wp) {
555
							# Filtering on correlation Score and collecting reads
556
							SKIP = FALSE
557
							# TODO: This for loop could be done before working on strain_ref2. Isn't it?
558
							reads_strain_ref1 = c()
559
							for (nuc in nuc_strain_ref1$nucs){
560
								reads_strain_ref1 = c(reads_strain_ref1, nuc$original_reads)
561
								if (nuc$corr < corr_thres) {
562
									SKIP = TRUE
563
								}
564
							}
565
							reads_strain_ref2 = c()
566
							for (nuc in nuc_strain_ref2$nucs){
567
								reads_strain_ref2 = c(reads_strain_ref2, nuc$original_reads)
568
								if (nuc$corr < corr_thres) {
569
									SKIP = TRUE
570
								}
571
							}
572
							# Filtering on correlation Score
573
							if (!SKIP) {
574
								# tranlation of reads into strain 2 coords
575
								diff = ((roi_strain_ref1$begin + roi_strain_ref1$end) - (roi_strain_ref2$begin + roi_strain_ref2$end)) / 2
576
								reads_strain_ref1 = reads_strain_ref1 - rep(diff, length(reads_strain_ref1))
577
								lod_score = lod_score_vecs(reads_strain_ref1, reads_strain_ref2)
578
								lod_scores = c(lod_scores, lod_score)
579
								# Filtering on LOD Score
580
								if (lod_score < lod_thres) {
581
									tmp_nuc = list()
582
									# strain_ref1
583
									tmp_nuc[[paste("chr_", strain_ref1, sep="")]] = chr
584
									tmp_nuc[[paste("lower_bound_", strain_ref1, sep="")]] = nuc_strain_ref1$lower_bound
585
									tmp_nuc[[paste("upper_bound_", strain_ref1, sep="")]] = nuc_strain_ref1$upper_bound
586
									tmp_nuc[[paste("mean_", strain_ref1, sep="")]] = signif(mean(reads_strain_ref1),5)
587
									tmp_nuc[[paste("sd_", strain_ref1, sep="")]] = signif(sd(reads_strain_ref1),5)
588
									tmp_nuc[[paste("nb_reads_", strain_ref1, sep="")]] = length(reads_strain_ref1)
589
									tmp_nuc[[paste("index_nuc_", strain_ref1, sep="")]] = index_nuc_strain_ref1
590
									# tmp_nuc[[paste("corr1_", strain_ref1, sep="")]] = signif(nuc_strain_ref1$nucs[[1]]$corr,5)
591
									# tmp_nuc[[paste("corr2_", strain_ref1, sep="")]] = signif(nuc_strain_ref1$nucs[[2]]$corr,5)
592
									# tmp_nuc[[paste("corr3_", strain_ref1, sep="")]] = signif(nuc_strain_ref1$nucs[[3]]$corr,5)
593
									# strain_ref2
594
									tmp_nuc[[paste("chr_", strain_ref2, sep="")]] = roi_strain_ref2$chr
595
									tmp_nuc[[paste("lower_bound_", strain_ref2, sep="")]] = nuc_strain_ref2$lower_bound
596
									tmp_nuc[[paste("upper_bound_", strain_ref2, sep="")]] = nuc_strain_ref2$upper_bound
597
									tmp_nuc[[paste("means_", strain_ref2, sep="")]] = signif(mean(reads_strain_ref2),5)
598
									tmp_nuc[[paste("sd_", strain_ref2, sep="")]] = signif(sd(reads_strain_ref2),5)
599
									tmp_nuc[[paste("nb_reads_", strain_ref2, sep="")]] = length(reads_strain_ref2)
600
									tmp_nuc[[paste("index_nuc_", strain_ref2, sep="")]] = index_nuc_strain_ref2
601
									# tmp_nuc[[paste("corr1_", strain_ref2, sep="")]] = signif(nuc_strain_ref2$nucs[[1]]$corr,5)
602
									# tmp_nuc[[paste("corr2_", strain_ref2, sep="")]] = signif(nuc_strain_ref2$nucs[[2]]$corr,5)
603
									# tmp_nuc[[paste("corr3_", strain_ref2, sep="")]] = signif(nuc_strain_ref2$nucs[[3]]$corr,5)
604
									# common
605
									tmp_nuc[["lod_score"]] = signif(lod_score,5)
606
									# print(tmp_nuc)
607
									common_nuc = dfadd(common_nuc, tmp_nuc)
608
								}
609
							}
610
						}
611
					}
612
				} else {
613
		      print("WARNING! No roi for strain ref 2.")
614
			  }
615
		  }
616
		}
617

    
618
		if(length(unique(common_nuc[,1:3])[,1]) != length((common_nuc[,1:3])[,1])) {
619
			index_redundant = which(apply(common_nuc[,1:3][-length(common_nuc[,1]),] ==  common_nuc[,1:3][-1,] ,1,sum) == 3)
620
			to_remove_list = c()
621
			for (i in 1:length(index_redundant)) {
622
				if (common_nuc[index_redundant[i],15] < common_nuc[index_redundant[i]+1,15]) {
623
				  to_remove = index_redundant[i]
624
				}	 else {
625
					to_remove = index_redundant[i] + 1
626
			  }
627
				to_remove_list = c(to_remove_list, to_remove)
628
			}
629
			common_nuc = common_nuc[-to_remove_list,]
630
		}
631

    
632
		if(length(unique(common_nuc[,8:10])[,1]) != length((common_nuc[,8:10])[,1])) {
633
			index_redundant = which(apply(common_nuc[,8:10][-length(common_nuc[,1]),] == common_nuc[,8:10][-1,] ,1,sum) == 3)
634
			to_remove_list = c()
635
			for (i in 1:length(index_redundant)) {
636
				if (common_nuc[index_redundant[i],15] < common_nuc[index_redundant[i]+1,15]) {
637
				  to_remove = index_redundant[i]
638
				}	 else {
639
					to_remove = index_redundant[i] + 1
640
			  }
641
				to_remove_list = c(to_remove_list, to_remove)
642
			}
643
			common_nuc = common_nuc[-to_remove_list,]
644
		}
645

    
646
		return(list(common_nuc, lod_scores))
647
	} else {
648
		print("WARNING, no nucs for strain_ref1.")
649
		return(NULL)
650
	}
651
### Returns a list of clusterized nucleosomes, and all computed lod scores.
652
}, ex=function(){
653

    
654
    # Define new translate_roi function...
655
    translate_roi = function(roi, strain2, big_roi=NULL, config=NULL) {
656
      return(roi)
657
    }
658
    # Binding it by uncomment follwing lines.
659
    unlockBinding("translate_roi", as.environment("package:nucleominer"))
660
    unlockBinding("translate_roi", getNamespace("nucleominer"))
661
    assign("translate_roi", translate_roi, "package:nucleominer")
662
    assign("translate_roi", translate_roi, getNamespace("nucleominer"))
663
    lockBinding("translate_roi", getNamespace("nucleominer"))
664
    lockBinding("translate_roi", as.environment("package:nucleominer"))
665

    
666
	# Dealing with a region of interest
667
	roi =list(name="example", begin=1000,  end=1300, chr="1", genome=rep("A",301), strain_ref1 = "STRAINREF1")
668
	roi2 = translate_roi(roi, roi$strain_ref1)
669
	replicates = list()
670
	for (j in 1:2) {
671
		samples = list()
672
		for (i in 1:3) {
673
			# Create TF output
674
			tf_nuc = list("chr"=paste("chr", roi$chr, sep=""), "center"=(roi$end + roi$begin)/2, "width"= 150, "correlation.score"= 0.9)
675
			outputs = dfadd(NULL,tf_nuc)
676
			outputs = filter_tf_outputs(outputs, roi$chr, roi$begin, roi$end)
677
			# Generate corresponding reads
678
			nb_reads = round(runif(1,170,230))
679
			reads = round(rnorm(nb_reads, tf_nuc$center,20))
680
			u_reads = sort(unique(reads))
681
			strands = sample(c(rep("R",ceiling(length(u_reads)/2)),rep("F",floor(length(u_reads)/2))))
682
			counts = apply(t(u_reads), 2, function(r) { sum(reads == r)})
683
			shifts = apply(t(strands), 2, function(s) { if (s == "F") return(-tf_nuc$width/2) else return(tf_nuc$width/2)})
684
			u_reads = u_reads + shifts
685
			inputs = data.frame(list("V1" = rep(roi$chr, length(u_reads)),
686
			                         "V2" = u_reads,
687
															 "V3" = strands,
688
															 "V4" = counts), stringsAsFactors=FALSE)
689
			samples[[length(samples) + 1]] = list(id=1, marker="Mnase_Seq", strain=paste("strain_ex",j,sep=""), total_reads = 10000000, roi=roi, inputs=inputs, outputs=outputs)
690
		}
691
		replicates[[length(replicates) + 1]] = samples
692
	}
693
	print(align_inter_strain_nucs(replicates))
694
})
695

    
696

    
697

    
698

    
699

    
700

    
701

    
702

    
703

    
704

    
705

    
706

    
707

    
708

    
709

    
710

    
711

    
712

    
713

    
714

    
715

    
716

    
717

    
718

    
719

    
720

    
721
fetch_mnase_replicates = function(# Prefetch data
722
### Fetch and filter inputs and outpouts per region of interest. Organize it per replicates.
723
strain, ##<< The strain we want mnase replicatesList of replicates. Each replicates is a vector of sample ids.
724
roi, ##<< Region of interest.
725
all_samples, ##<< Global list of samples.
726
config=NULL, ##<< GLOBAL config variable
727
only_fetch=FALSE, ##<< If TRUE, only fetch and not filtering. It is used tio load sample files into memory before forking.
728
get_genome=FALSE, ##<< If TRUE, load corresponding genome sequence.
729
get_ouputs=TRUE##<< If TRUE, get also ouput corresponding TF output files.
730
) {
731
	samples=list()
732
  samples_ids = unique(all_samples[all_samples$marker == "Mnase_Seq" & all_samples$strain == strain,]$id)
733
	for (i in samples_ids) {
734
		sample = as.list(all_samples[all_samples$id==i,])
735
    sample$orig_roi = roi
736
    sample$roi = translate_roi(roi, sample$strain, config = config)
737
		if (get_genome) {
738
			# Get Genome
739
      sample$roi$genome = get_content(config$FASTA_REFERENCE_GENOME_FILES[[sample$strain]], "fasta")[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end]
740
		}
741
		# Get inputs
742
		sample$inputs = get_content(paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep=""), "table", stringsAsFactors=FALSE)
743
		sample$total_reads = sum(sample$inputs[,4])
744
		if (!only_fetch) {
745
		  sample$inputs = filter_tf_inputs(sample$inputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
746
	  }
747
	  # Get TF outputs for Mnase_Seq samples
748
		if (sample$marker == "Mnase_Seq" & get_ouputs) {
749
			sample$outputs = get_content(paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep=""), "table", header=TRUE, sep="\t")
750
			if (!only_fetch) {
751
	  		sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr,  min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
752
  		}
753
		}
754
		samples[[length(samples) + 1]] = sample
755
	}
756
  return(samples)
757
}
758

    
759
substract_region = function(# Substract to a list of regions an other list of regions that intersect it.
760
### This fucntion embed a recursive part. It occurs when a substracted region split an original region on two.
761
region1, ##<< Original regions.
762
region2 ##<< Regions to substract.
763
) {
764
  rec_substract_region = function(region1, region2) {
765
  non_inter_fuzzy = apply(t(1:length(region1[,1])), 2, function(i) {
766
    cur_fuzzy = region1[i,]
767
    inter_wp = region2[region2$lower_bound <= cur_fuzzy$upper_bound & region2$upper_bound >= cur_fuzzy$lower_bound,]
768
    if (length(inter_wp[,1]) > 0) {
769
      ret = c()
770
      for (j in 1:length(inter_wp[,1])) {
771
        cur_wp = inter_wp[j,]
772
        if (cur_wp$lower_bound <= cur_fuzzy$lower_bound & cur_fuzzy$upper_bound <= cur_wp$upper_bound) {
773
          # remove cur_fuzzy
774
          ret = c()
775
          break
776
        } else if (cur_wp$lower_bound <= cur_fuzzy$lower_bound & cur_wp$upper_bound < cur_fuzzy$upper_bound) {
777
          # crop fuzzy
778
          cur_fuzzy$lower_bound = cur_wp$upper_bound + 1
779
          ret = cur_fuzzy
780
        } else if (cur_fuzzy$lower_bound < cur_wp$lower_bound & cur_fuzzy$upper_bound <= cur_wp$upper_bound) {
781
          # crop fuzzy
782
          cur_fuzzy$upper_bound = cur_wp$lower_bound - 1
783
          ret = cur_fuzzy
784
        } else if (cur_wp$lower_bound > cur_fuzzy$lower_bound & cur_wp$upper_bound < cur_fuzzy$upper_bound) {
785
          # split fuzzy
786
          tmp_ret_fuzzy_1 = cur_fuzzy
787
          tmp_ret_fuzzy_1$upper_bound = cur_wp$lower_bound - 1
788
          tmp_ret_fuzzy_2 = cur_fuzzy
789
          tmp_ret_fuzzy_2$lower_bound = cur_wp$upper_bound + 1
790
          ret = rec_substract_region(rbind(tmp_ret_fuzzy_1, tmp_ret_fuzzy_2), inter_wp)
791
          # print(ret)
792
          # ret = cur_fuzzy
793
          break
794
        } else {
795
          stop("WARNING NO ADAPTED CASE!")
796
        }
797
      }
798
      return(ret)
799
    } else {
800
      return(cur_fuzzy)
801
    }
802
  })
803
  }
804
  non_inter_fuzzy = rec_substract_region(region1[,1:4], region2[,1:4])
805
  if (is.null(non_inter_fuzzy)) {return(non_inter_fuzzy)}
806
  tmp_ulist = unlist(non_inter_fuzzy)
807
  tmp_names = names(tmp_ulist)[1:4]
808
  non_inter_fuzzy = data.frame(matrix(tmp_ulist, ncol=4, byrow=TRUE), stringsAsFactors=FALSE)
809
  names(non_inter_fuzzy) = tmp_names
810
  non_inter_fuzzy$chr = as.character(non_inter_fuzzy$chr)
811
  non_inter_fuzzy$chr = as.numeric(non_inter_fuzzy$chr)
812
  non_inter_fuzzy$lower_bound = as.numeric(non_inter_fuzzy$lower_bound)
813
  non_inter_fuzzy$upper_bound = as.numeric(non_inter_fuzzy$upper_bound)
814
  non_inter_fuzzy = non_inter_fuzzy[order(non_inter_fuzzy$lower_bound),]
815
  return(non_inter_fuzzy)
816
}
817

    
818
union_regions = function(# Aggregate regions that intersect themnselves.
819
### This function is based on sort of lower bounds to detect regions that intersect. We compare lower bound and upper bound of the porevious item. This function embed a while loop and break break regions list become stable.
820
regions ##<< The Regions to be aggregated
821
) {
822
  if (is.null(regions)) {return(regions)}
823
  if (nrow(regions) == 0) {return(regions)}
824
  old_length = length(regions[,1])
825
  new_length = 0
826

    
827
  while (old_length != new_length) {
828
    regions = regions[order(regions$lower_bound), ]
829
    regions$stop = !c(regions$lower_bound[-1] - regions$upper_bound[-length(regions$lower_bound)] <= 1, TRUE)
830
    vec_end_1 = which(regions$stop)
831
    if (length(vec_end_1) == 0) {
832
      vec_end_1 = c(length(regions$stop))
833
    }
834
    if (vec_end_1[length(vec_end_1)] != length(regions$stop)) {
835
      vec_end_1 = c(vec_end_1, length(regions$stop))
836
    }
837
    vec_beg_1 = c(1, vec_end_1[-length(vec_end_1)] + 1)
838
    union = apply(t(1:length(vec_beg_1)), 2, function(i) {
839
      chr = regions$chr[vec_beg_1[i]]
840
      lower_bound = min(regions$lower_bound[vec_beg_1[i]:vec_end_1[i]])
841
      upper_bound = max(regions$upper_bound[vec_beg_1[i]:vec_end_1[i]])
842
      roi_index = regions$roi_index[vec_beg_1[i]]
843
      data.frame(list(chr=chr, lower_bound=lower_bound, upper_bound=upper_bound, roi_index=roi_index))
844
      })
845
    union = collapse_regions(union)
846
    old_length = length(regions[,1])
847
    new_length = length(union[,1])
848
    regions = union
849
  }
850
  return(union)
851
}
852

    
853
# remove_aligned_wp = function(# Remove wp nucs from common nucs list.
854
# ### It is based on common wp nucs index on nucs and region.
855
# strain_maps, ##<< Nuc maps.
856
# roi_index, ##<< The region of interest index.
857
# tmp_common_nucs, ##<< the list of wp nucs.
858
# strain##<< The strain to consider.
859
# ){
860
#   fuzzy_nucs = strain_maps[[strain]]
861
#   fuzzy_nucs = fuzzy_nucs[fuzzy_nucs$roi_index == roi_index,]
862
#   fuzzy_nucs = fuzzy_nucs[order(fuzzy_nucs$index_nuc),]
863
#   if (length(fuzzy_nucs[,1]) == 0) {return(fuzzy_nucs)}
864
#   if (sum(fuzzy_nucs$index_nuc == min(fuzzy_nucs$index_nuc):max(fuzzy_nucs$index_nuc)) != max(fuzzy_nucs$index_nuc)) {"Warning in index!"}
865
#   anti_index_1 = tmp_common_nucs[[paste("index_nuc", strain, sep="_")]]
866
#   fuzzy_nucs = fuzzy_nucs[-anti_index_1,]
867
#   return(fuzzy_nucs)
868
# }
869

    
870
translate_regions = function(# Translate a list of regions from a strain ref to another.
871
### This function is an eloborated call to translate_roi.
872
regions, ##<< Regions to be translated.
873
combi, ##<< Combination of strains.
874
roi_index, ##<< The region of interest index.
875
config=NULL, ##<< GLOBAL config variable
876
roi ##<< The region of interest.
877
) {
878
  tr_regions = apply(t(1:length(regions[,1])), 2, function(i) {
879
    tmp_regions_ref2 = list(name="foo", begin=regions[i,]$lower_bound, end=regions[i,]$upper_bound, chr=as.character(regions[i,]$chr), strain_ref = combi[2])
880
    big_roi =  roi
881
    trs_tmp_regions_ref2 = translate_roi(tmp_regions_ref2, combi[1], config = config, big_roi = big_roi)
882
    if (is.null(trs_tmp_regions_ref2)) {
883
      return(NULL)
884
    } else {
885
      return(data.frame(list(chr=trs_tmp_regions_ref2$chr, lower_bound=min(trs_tmp_regions_ref2$begin, trs_tmp_regions_ref2$end), upper_bound=max(trs_tmp_regions_ref2$begin, trs_tmp_regions_ref2$end), roi_index=roi_index)))
886
    }
887
  })
888

    
889
  return(collapse_regions(tr_regions))
890
}
891

    
892
collapse_regions = function(# reformat an "apply  manipulated" list of regions
893
### Utils to reformat an "apply  manipulated" list of regions
894
regions ##< a list of regions
895
) {
896
  if (is.null(regions)) {
897
    return(NULL)
898
  } else {
899
    regions = do.call(rbind, regions)
900
    regions$chr = as.character(regions$chr)
901
    regions$chr = as.numeric(regions$chr)
902
    regions$lower_bound = as.numeric(regions$lower_bound)
903
    regions$upper_bound = as.numeric(regions$upper_bound)
904
    regions = regions[order(regions$lower_bound),]
905
    return(regions)
906
  }
907
}
908

    
909

    
910
crop_fuzzy = function(# Crop bound of regions according to region of interest bound
911
### The fucntion is no more necessary since we remove "big_roi" bug in translate_roi function.
912
tmp_fuzzy_nucs, ##<< the regiuons to be croped.
913
roi, ##<< The region of interest.
914
strain, ##<< The strain to consider.
915
config=NULL ##<< GLOBAL config variable
916
) {
917
  tr_roi = translate_roi(roi, strain, config = config)
918
  tr_roi_begin = min(tr_roi$begin, tr_roi$end)
919
  tr_roi_end = max(tr_roi$begin, tr_roi$end)
920
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound < tr_roi_begin,1]) > 0) {
921
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound < tr_roi_begin,]$lower_bound = tr_roi_begin
922
  }
923
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound < tr_roi_begin,1]) > 0) {
924
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound < tr_roi_begin,]$upper_bound = tr_roi_begin
925
  }
926
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound > tr_roi_end,1]) > 0) {
927
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound > tr_roi_end,]$lower_bound = tr_roi_end
928
  }
929
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound > tr_roi_end,1]) > 0) {
930
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound > tr_roi_end,]$upper_bound = tr_roi_end
931
  }
932
  tmp_fuzzy_nucs = tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound != tmp_fuzzy_nucs$lower_bound,]
933
  return(tmp_fuzzy_nucs)
934
}
935

    
936
get_all_reads = function(# Retrieve Reads
937
### Retrieve reads for a given marker, combi, form.
938
marker, ##<< The marker to considere.
939
combi, ##<< The starin combination to considere.
940
form="wp", ##<< The nuc form to considere.
941
config=NULL ##<< GLOBAL config variable
942
) {
943
	all_reads = NULL
944
  for (manip in c("Mnase_Seq", marker)) {
945
    if (form == "fuzzy") {
946
		  out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_fuzzy_and_nbreads.tab",sep="")
947
  		tmp_res = read.table(file=out_filename, header=TRUE)
948
			tmp_res = tmp_res[tmp_res[,3] - tmp_res[,2] > 75,]
949
      tmp_res$form = form
950
    } else if (form == "wp") {
951
		 	out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_wp_and_nbreads.tab",sep="")
952
  		tmp_res = read.table(file=out_filename, header=TRUE)
953
      tmp_res$form = form
954
    } else if (form == "wpfuzzy") {
955
		 	out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_wp_and_nbreads.tab",sep="")
956
  		tmp_res = read.table(file=out_filename, header=TRUE)
957
      tmp_res$form = "wp"
958
		  out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_fuzzy_and_nbreads.tab",sep="")
959
  		tmp_res2 = read.table(file=out_filename, header=TRUE)
960
			tmp_res2 = tmp_res2[tmp_res2[,3] - tmp_res2[,2] > 75,]
961
      tmp_res2$form = "fuzzy"
962
      tmp_res = rbind(tmp_res, tmp_res2)
963
    }
964
		if (is.null(all_reads)) {
965
			all_reads = tmp_res[,c(1:9,length(tmp_res))]
966
		}
967
		tmp_res = tmp_res[,-c(1:9,length(tmp_res))]
968
		all_reads = cbind(all_reads, tmp_res)
969
  }
970
  return(all_reads)
971
}
972

    
973
get_design = function(# Build the design for deseq
974
### This function build the design according sample properties.
975
marker, ##<< The marker to considere.
976
combi, ##<< The starin combination to considere.
977
all_samples ##<< Global list of samples.
978
) {
979
  off1 = 0
980
  off2 = 0
981
	manips = c("Mnase_Seq", marker)
982
	design_rownames = c()
983
	design_manip = c()
984
	design_strain = c()
985
  off2index = function(off) {
986
  	switch(toString(off),
987
  		"1"=c(0,1,1),
988
  	  "2"=c(1,0,1),
989
    	"3"=c(1,1,0),
990
  		c(1,1,1)
991
  		)
992
  }
993
	for (manip in manips) {
994
		tmp_samples = all_samples[ ((all_samples$strain == combi[1] | all_samples$strain == combi[2]) &  all_samples$marker == manip), ]
995
		tmp_samples = tmp_samples[order(tmp_samples$strain), ]
996
		if (manip == "H3K4me1" & (off1 != 0 & off2 ==0 )) {
997
			tmp_samples = tmp_samples[c(off2index(off1), c(1,1)) == 1,]
998
		} else {
999
			if (manip != "Mnase_Seq" & (off1 != 0 | off2 !=0)) {
1000
				tmp_samples = tmp_samples[c(off2index(off1), off2index(off2)) == 1,]
1001
			}
1002
		}
1003
		design_manip = c(design_manip, rep(manip, length(tmp_samples$id)))
1004
		for (strain in combi) {
1005
			cols = apply(t(tmp_samples[ (tmp_samples$strain == strain &  tmp_samples$marker == manip), ]$id), 2, function(i){paste(strain, manip, i, sep="_")})
1006
			design_strain = c(design_strain, rep(strain, length(cols)))
1007
			design_rownames = c(design_rownames, cols)
1008
		}
1009
	}
1010
	snep_design = data.frame( row.names=design_rownames, manip=design_manip, strain=design_strain)
1011
	return(snep_design)
1012
}
1013

    
1014
plot_dist_samples = function(# Plot the distribution of reads.
1015
### This fuxntion use the deseq nomalization feature to compare qualitatively the distribution.
1016
strain, ##<< The strain to considere.
1017
marker, ##<< The marker to considere.
1018
res, ##<< Data
1019
all_samples, ##<< Global list of samples.
1020
NEWPLOT = TRUE ##<< If FALSE the curve will be add to the current plot.
1021
) {
1022
	cols = apply(t(all_samples[ (all_samples$strain == strain &  all_samples$marker == marker), ]$id), 2, function(i){paste(strain, marker, i, sep="_")})
1023
	snepCountTable = res[,cols]
1024
	snepDesign = data.frame(
1025
		row.names = cols,
1026
		manip = rep(marker, length(cols)),
1027
		strain = rep(strain, length(cols))
1028
		)
1029
	cdsFull = newCountDataSet(snepCountTable, snepDesign)
1030
	sizeFactors = estimateSizeFactors(cdsFull)
1031
	# print(sizeFactors[[1]])
1032
	sample_ids = all_samples[ (all_samples$strain == strain &  all_samples$marker == marker), ]$id
1033
	if (NEWPLOT) {
1034
		plot(density(res[,paste(strain, marker, sample_ids[1], sep="_")] / sizeFactors[[1]][1]), col=0, main=paste(strain, marker))
1035
		NEWPLOT = FALSE
1036
	}
1037
	for (it in 1:length(sample_ids)) {
1038
		sample_id = sample_ids[it]
1039
		lines(density(res[,paste(strain, marker, sample_id, sep="_")] / sizeFactors[[1]][it]), col = it + 1, lty = it)
1040
	}
1041
  legend("topright", col=(1:length(sample_ids))+1, lty=1:length(sample_ids), legend=cols)
1042
}
1043

    
1044
analyse_design = function(# Launch deseq methods.
1045
### This function is based on deseq example. It mormalizes data, fit data to GLM model with and without interaction term and compare the two l;=models.
1046
snep_design, ##<< The design to considere.
1047
reads ##<< The data to considere.
1048
) {
1049
	snep_count_table = reads[, rownames(snep_design)]
1050
	cdsFull = newCountDataSet(snep_count_table, snep_design)
1051
	cdsFull1 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
1052
	fit1 = fitNbinomGLMs(cdsFull1, count ~ manip * strain)
1053

    
1054
	cdsFull0 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
1055
	fit0 = fitNbinomGLMs(cdsFull0, count ~ manip + strain)
1056

    
1057
	pvalsGLM = nbinomGLMTest( fit1, fit0 )
1058
	return(list(fit1, fit0, snep_design, pvalsGLM))
1059
}
1060

    
1061

    
1062

    
1063

    
1064

    
1065

    
1066

    
1067

    
1068
get_sneps = structure(function(# Compute the list of SNEPs for a given set of marker, strain combination and nuc form.
1069
### This function uses
1070
marker, ##<< The marker involved.
1071
combi, ##<< The strain combination involved.
1072
form, ##<< the nuc form involved.
1073
all_samples, ##<< Global list of samples.
1074
config=NULL ##<< GLOBAL config variable
1075
) {
1076
  # PRETREAT
1077
  d = get_design(marker, combi, all_samples)
1078
  reads = get_all_reads(marker, combi, form, config=config)
1079
  # RUN ANALYSE
1080
  tmp_analyse = analyse_design(d, reads)
1081
  # RESULTS
1082
	fit1 = tmp_analyse[[1]]
1083
	fit0 = tmp_analyse[[2]]
1084
  k = names(fit1)
1085
  reads[[k[2]]] = signif(fit1[[k[2]]], 5)
1086
  reads[[k[3]]] = signif(fit1[[k[3]]], 5)
1087
  reads[[k[4]]] = signif(fit1[[k[4]]], 5)
1088
	reads$pvalsGLM = signif(tmp_analyse[[4]], 5)
1089
	snep_design = tmp_analyse[[3]]
1090
  print(snep_design)
1091
	fdr = 0.0001
1092
	thres = FDR(reads$pvalsGLM, fdr)
1093
	reads$snep_index = reads$pvalsGLM < thres
1094
	print(paste(sum(reads$snep_index), " SNEPs found for ", length(reads[,1])," nucs and ", fdr*100,"% of FDR.", sep = ""))
1095
  return(reads)
1096
  },  ex=function(){
1097
    marker = "H3K4me1"
1098
    combi = c("BY", "YJM")
1099
    form = "wpfuzzy" # "wp" | "fuzzy" | "wpfuzzy"
1100
    # foo = get_sneps(marker, combi, form)
1101
    # foo = get_sneps("H4K12ac", c("BY", "RM"), "wp")
1102
})
1103

    
1104

    
1105

    
1106

    
1107

    
1108

    
1109

    
1110

    
1111

    
1112

    
1113

    
1114

    
1115

    
1116

    
1117

    
1118

    
1119

    
1120

    
1121

    
1122

    
1123

    
1124

    
1125

    
1126

    
1127
ROM2ARAB = function(# Roman to Arabic pair list.
1128
### Util to convert Roman to Arabic
1129
){list(
1130
  "I" = 1,
1131
  "II" = 2,
1132
  "III" = 3,
1133
  "IV" = 4,
1134
  "V" = 5,
1135
  "VI" = 6,
1136
  "VII" = 7,
1137
  "VIII" = 8,
1138
  "IX" = 9,
1139
  "X" = 10,
1140
  "XI" = 11,
1141
  "XII" = 12,
1142
  "XIII" = 13,
1143
  "XIV" = 14,
1144
  "XV" = 15,
1145
  "XVI" = 16,
1146
  "XVII" = 17,
1147
  "XVIII" = 18,
1148
  "XIX" = 19,
1149
  "XX" = 20
1150
)}
1151

    
1152
switch_pairlist = structure(function(# Switch a pairlist
1153
### Take a pairlist key:value and return the switched pairlist value:key.
1154
l ##<< The pairlist to switch.
1155
) {
1156
	ret = list()
1157
	for (name in names(l)) {
1158
		ret[[as.character(l[[name]])]] = name
1159
	}
1160
	ret
1161
### The switched pairlist.
1162
}, ex=function(){
1163
	l = list(key1 = "value1", key2 = "value2")
1164
	print(switch_pairlist(l))
1165
})
1166

    
1167
ARAB2ROM = function(# Arabic to Roman pair list.
1168
### Util to convert Arabicto Roman
1169
){switch_pairlist(ROM2ARAB())}
1170

    
1171

    
1172

    
1173

    
1174
c2c_extraction = function(# Extract a sub part of the corresponding c2c file
1175
### This fonction allow to acces to a specific part of the c2c file.
1176
strain1, ##<< the key strain
1177
strain2, ##<< the target strain
1178
chr=NULL, ##<< if defined, the c2c will filtered according to the chromosome value
1179
lower_bound=NULL, ##<< if defined, the c2c will filtered for part of the genome upper than lower_bound
1180
upper_bound=NULL, ##<< if defined, the c2c will filtered for part of the genome lower than upper_bound
1181
config=NULL##<<  GLOBAL config variable
1182
) {
1183
  reverse = (strain1=="RM" & strain2=="BY") | strain1=="YJM"
1184
	# Launch c2c file
1185
	if (reverse) {
1186
		c2c_filename = config$C2C_FILES[[paste(strain2, "-", strain1, sep="")]]
1187
	} else {
1188
		c2c_filename = config$C2C_FILES[[paste(strain1, "-", strain2, sep="")]]
1189
	}
1190
	c2c = get_content(c2c_filename, "table", stringsAsFactors=FALSE)
1191
  # Filtering unagapped
1192
  c2c = c2c[c2c$V6=="-",]
1193
	# Reverse
1194
	if (reverse) {
1195
		tmp_col = c2c$V1
1196
		c2c$V1 = c2c$V7
1197
		c2c$V7 = tmp_col
1198
		tmp_col = c2c$V2
1199
		c2c$V2 = c2c$V9
1200
		c2c$V9 = tmp_col
1201
		tmp_col = c2c$V3
1202
		c2c$V3 = c2c$V10
1203
		c2c$V10 = tmp_col
1204
	}
1205
  if (!is.null(chr)) {
1206
  	if (strain1 == "BY") {
1207
  		chro_1 = paste("chr", ARAB2ROM()[[chr]], sep="")
1208
  	} else if (strain1 == "RM") {
1209
  	  chro_1 = paste("supercontig_1.",chr,sep="")
1210
  	} else if (strain1 == "YJM") {
1211
  	  chro_1 = switch_pairlist(config$FASTA_INDEXES$YJM)[[chr]]
1212
  	}
1213
  	c2c = c2c[c2c$V1 == chro_1,]
1214
    if (!is.null(lower_bound)) {
1215
      if (length(c2c[c2c$V3 < lower_bound & c2c$V2 < c2c$V3, 1] > 0)) {c2c[c2c$V3 < lower_bound & c2c$V2 < c2c$V3,c("V2", "V3") ] = lower_bound}
1216
      if (length(c2c[c2c$V2 < lower_bound & c2c$V3 < c2c$V2, 1] > 0)) {c2c[c2c$V2 < lower_bound & c2c$V3 < c2c$V2,c("V2", "V3") ] = lower_bound}      
1217
      c2c = c2c[c2c$V2 - c2c$V3 != 0,]
1218
    }
1219
    if (!is.null(upper_bound)) {
1220
      if (length(c2c[c2c$V2 > upper_bound & c2c$V2 < c2c$V3, 1] > 0)) {c2c[c2c$V2 > upper_bound & c2c$V2 < c2c$V3, c("V2", "V3")] = upper_bound}
1221
      if (length(c2c[c2c$V3 > upper_bound & c2c$V3 < c2c$V2, 1] > 0)) {c2c[c2c$V3 > upper_bound & c2c$V3 < c2c$V2, c("V2", "V3")] = upper_bound}      
1222
      c2c = c2c[c2c$V2 - c2c$V3 != 0,]
1223
    }
1224
  }
1225
  return(c2c)
1226
# It retruns the appropriate c2c file part.
1227
}
1228

    
1229
translate_roi = structure(function(# Translate coords of a genome region.
1230
### This function is used in the examples, usualy you have to define your own translation function and overwrite this one using \emph{unlockBinding} features. Please, refer to the example.
1231
roi, ##<< Original genome region of interest.
1232
strain2, ##<< The strain in wich you want the genome region of interest.
1233
config=NULL, ##<< GLOBAL config variable
1234
big_roi=NULL ##<< A largest region than roi use to filter c2c if it is needed.
1235
) {
1236
	strain1 = roi$strain_ref
1237
	if (strain1 == strain2) {
1238
		roi$length = roi$end - roi$begin + sign(roi$end - roi$begin) * 1
1239
		return(roi)
1240
	}
1241

    
1242
	# Extract c2c file
1243
	if (!is.null(big_roi)) {
1244
  	# Dealing with big_roi
1245
		if (roi$strain_ref != big_roi$strain_ref) {
1246
      big_roi = translate_roi(big_roi, roi$strain_ref, config=config)
1247
    }
1248
    if (big_roi$end < big_roi$begin) {
1249
      tmp_var = big_roi$begin
1250
      big_roi$begin = big_roi$end
1251
      big_roi$end = tmp_var
1252
      big_roi$length = big_roi$end - big_roi$begin + 1
1253
    }
1254
    if (big_roi$chr!=roi$chr | roi$end > big_roi$end | roi$end < big_roi$begin | roi$begin > big_roi$end | roi$begin < big_roi$begin) {
1255
      print("WARNING! Trying to translate a roi not included in a big_roi.")
1256
      return(NULL)
1257
    }
1258
  	c2c = c2c_extraction(strain1, strain2, big_roi$chr, big_roi$begin, big_roi$end, config=config)
1259
  } else {
1260
    # No big_roi
1261
  	c2c = c2c_extraction(strain1, strain2, roi$chr, config=config)    
1262
  }
1263
  
1264
  #	Convert initial roi$chr into c2c format
1265
  reverse = (strain1=="RM" & strain2=="BY") | strain1=="YJM"
1266
	begin_1 = roi$begin
1267
  end_1 = roi$end
1268
  if (reverse) {
1269
  	tmptransfostart = c2c[(c2c$V3>=begin_1 & c2c$V2<=begin_1 & c2c$V8==1) | (c2c$V2>=begin_1 & c2c$V3<=begin_1 & c2c$V8==-1),]
1270
    tmptransfostop = c2c[(c2c$V3>=end_1   & c2c$V2<=end_1   & c2c$V8==1) | (c2c$V2>=end_1   & c2c$V3<=end_1   & c2c$V8==-1),]
1271
	} else {
1272
		tmptransfostart = c2c[c2c$V3>=begin_1 & c2c$V2<=begin_1,]
1273
	  tmptransfostop = c2c[c2c$V3>=end_1 & c2c$V2<=end_1,]
1274
	}
1275

    
1276
	# Never happend conditions ...
1277
	{
1278
		if (length(tmptransfostart$V8) == 0) {
1279
			# begin_1 is between to lines: shift begin_1 to the start of 2nd line.
1280
      if (sum(c2c$V3 >= c2c$V2) != 0) {
1281
  			tmp_c2c = c2c[c2c$V2>=begin_1,]
1282
  			begin_1 = min(tmp_c2c$V2)
1283
      } else {
1284
  			tmp_c2c = c2c[c2c$V3>=begin_1,]
1285
  			begin_1 = min(tmp_c2c$V3)
1286
      }
1287
			if (reverse) {
1288
		  	tmptransfostart = c2c[(c2c$V3>=begin_1 & c2c$V2<=begin_1 & c2c$V8==1) | (c2c$V2>=begin_1 & c2c$V3<=begin_1 & c2c$V8==-1),]
1289
			} else {
1290
				tmptransfostart = c2c[c2c$V3>=begin_1 & c2c$V2<=begin_1,]
1291
			}
1292
			if (length(tmptransfostart$V8) == 0) {
1293
				if (!is.null(big_roi)) {
1294
					return(NULL)
1295
					tmptransfostart = c2c[c2c$V3>=big_roi$begin & c2c$V2<=big_roi$begin,]
1296
				} else {
1297
					print(tmptransfostart)
1298
					print(tmptransfostop)
1299
					stop("Never happend condition 1.")
1300
				}
1301
			}
1302
		}
1303
		if (length(tmptransfostop$V8) == 0) {
1304
			# end_1 is between to lines: shift end_1 to the end of 2nd line.
1305
      if (sum(c2c$V3 >= c2c$V2) != 0) {
1306
  			tmp_c2c = c2c[c2c$V3<=end_1,]
1307
  			end_1 = max(tmp_c2c$V3)
1308
      } else {
1309
  			tmp_c2c = c2c[c2c$V2<=end_1,]
1310
  			end_1 = max(tmp_c2c$V2)
1311
      }
1312
			if (reverse) {
1313
		    tmptransfostop = c2c[(c2c$V3>=end_1   & c2c$V2<=end_1   & c2c$V8==1) | (c2c$V2>=end_1   & c2c$V3<=end_1   & c2c$V8==-1),]
1314
			} else {
1315
			  tmptransfostop = c2c[c2c$V3>=end_1 & c2c$V2<=end_1,]
1316
			}
1317
			if (length(tmptransfostop$V8) == 0) {
1318
				if (!is.null(big_roi)) {
1319
					return(NULL)
1320
				  tmptransfostop = c2c[c2c$V3>=big_roi$end & c2c$V2<=big_roi$end,]
1321
				} else {
1322
					print(tmptransfostart)
1323
					print(tmptransfostop)
1324
					stop("Never happend condition 2.")
1325
				}
1326
			}
1327
		}
1328
		if (length(tmptransfostart$V8) != 1) {
1329
			# print("many start")
1330
			tmptransfostart = tmptransfostart[tmptransfostart$V3>=begin_1 & tmptransfostart$V2==begin_1,]
1331
			if (length(tmptransfostart$V8) != 1) {
1332
				print(tmptransfostart)
1333
				print(tmptransfostop)
1334
  			stop("Never happend condition 3.")
1335
			}
1336
		}
1337
		if (length(tmptransfostop$V8) != 1) {
1338
			# print("many stop")
1339
		  tmptransfostop = tmptransfostop[tmptransfostop$V3==end_1 & tmptransfostop$V2<=end_1,]
1340
			if (length(tmptransfostop$V8) != 1) {
1341
				print(tmptransfostart)
1342
				print(tmptransfostop)
1343
  			stop("Never happend condition 4.")
1344
			}
1345
		}
1346
		if (tmptransfostart$V7 != tmptransfostop$V7) {
1347
			print(tmptransfostart)
1348
			print(tmptransfostop)
1349
 			stop("Problem with genome region of interest of strain 1. \nIt is translated over many contigs into strain 2 ref. \nSorry, but you have to redefine your region of interest.")
1350
		}
1351
	}
1352
  
1353
  # Deal with strand
1354
  if (tmptransfostart$V8 == 1) {
1355
    begin_2 = tmptransfostart$V9 + (begin_1 - tmptransfostart$V2)
1356
    end_2 = tmptransfostop$V9 + (end_1 - tmptransfostop$V2)
1357
  } else {
1358
    begin_2 = tmptransfostart$V9 - (begin_1 - tmptransfostart$V2)
1359
    end_2 = tmptransfostop$V9 - (end_1 - tmptransfostop$V2)
1360
  }
1361
	# Build returned roi
1362
	roi$strain_ref = strain2
1363
	if (roi$strain_ref == "BY") {
1364
		roi$chr = ROM2ARAB()[[substr(tmptransfostart$V7, 4, 12)]]
1365
	} else {
1366
		roi$chr = config$FASTA_INDEXES[[strain2]][[tmptransfostart$V7]]
1367
	}
1368
  roi$begin = begin_2
1369
  roi$end = end_2
1370
	if (sign(roi$end - roi$begin) == 0) {
1371
		roi$length = 1
1372
	} else {
1373
		roi$length = roi$end - roi$begin + sign(roi$end - roi$begin) * 1
1374
	}
1375
  return(roi)
1376
}, ex=function(){
1377
	# Define new translate_roi function...
1378
	translate_roi = function(roi, strain2, config) {
1379
		strain1 = roi$strain_ref
1380
		if (strain1 == strain2) {
1381
			return(roi)
1382
		} else {
1383
		  stop("Here is my new translate_roi function...")
1384
		}
1385
	}
1386
	# Binding it by uncomment follwing lines.
1387
	# unlockBinding("translate_roi", as.environment("package:nm"))
1388
	# unlockBinding("translate_roi", getNamespace("nm"))
1389
	# assign("translate_roi", translate_roi, "package:nm")
1390
	# assign("translate_roi", translate_roi, getNamespace("nm"))
1391
	# lockBinding("translate_roi", getNamespace("nm"))
1392
	# lockBinding("translate_roi", as.environment("package:nm"))
1393
})
1394

    
1395

    
1396
compute_inter_all_strain_curs = function (# Compute Common Uninterrupted Regions (CUR)
1397
### CURs are regions that can be aligned between the genomes
1398
diff_allowed = 30, ##<< the maximum indel width allowe din a CUR
1399
min_cur_width = 4000, ##<< The minimum width of a CUR
1400
config = NULL ##<< GLOBAL config variable
1401
) {
1402

    
1403
  check_overlaping = function(strain1, strain2, chr, lower_bound, upper_bound, config=NULL) {
1404
    c2c = c2c_extraction(strain1, strain2, chr, lower_bound, upper_bound, config=config)
1405
    check_homogeneity(c2c)
1406
  	if (length(c2c[,1]) == 0 ) {
1407
      stop("WARNING! checking overlapping for a region corresponding to an empty c2c.")
1408
    } else {
1409
  		lower_bounds = apply(t(1:nrow(c2c)), 2,function(i){l = c2c[i,]; min(l$V2, l$V3)})
1410
  		upper_bounds = apply(t(1:nrow(c2c)), 2,function(i){l = c2c[i,]; max(l$V2, l$V3)})
1411
  		tmp_index = order(lower_bounds)
1412
      lower_bounds = lower_bounds[tmp_index]
1413
      upper_bounds = upper_bounds[tmp_index]
1414
      tmp_diff = lower_bounds[-1] - upper_bounds[-length(upper_bounds)]
1415
      ov_index = which(tmp_diff < 0)
1416
      if(length(ov_index < 0) !=0 ) {
1417
        ov_index = ov_index[1]
1418
        print(paste("WARNING! Overlaping", " (", strain1, ",", strain2, ") chr: ", c2c[1,]$V1, sep=""))
1419
        c2c_corrupted = c2c[tmp_index,][c(ov_index, ov_index + 1),]
1420
        print(c2c_corrupted)
1421
        return(list(lower_bounds[ov_index+1] - 1, upper_bounds[ov_index] + 1))
1422
      }
1423
      return(NULL)
1424
    }
1425
  }
1426

    
1427
  check_homogeneity = function(sub_c2c) {
1428
    tmp_signs = sign(sub_c2c$V2 - sub_c2c$V3)
1429
    tmp_signs = tmp_signs[tmp_signs != 0]
1430
  	if (sum(tmp_signs[1]  != tmp_signs)) {
1431
  		print(paste("*************** ERROR, non homogenous region (sign)! ********************"))
1432
      print(tmp_signs)
1433
  	}
1434
    tmp_signs2 = sign(sub_c2c$V9 - sub_c2c$V10)
1435
    tmp_signs2 = tmp_signs2[tmp_signs2 != 0]
1436
  	if (sum(tmp_signs2[1]  != tmp_signs2)) {
1437
  		print(paste("*************** ERROR, non homogenous region (sign2)! ********************"))
1438
      print(tmp_signs2)
1439
  	}
1440
  	if (length(unique(sub_c2c[,c(1,7,8)])[,2]) != 1) {
1441
  		print("*************** ERROR, non homogenous region chrs or V8! ********************")
1442
  	}
1443
  }
1444

    
1445
  test_and_squeeze_rois = function(foo, config=NULL) {
1446
    is_it_ok = function(list1, list2) {
1447
      bar = cbind(list1$begin, list2$begin, abs(list1$begin - list2$begin), list1$end, list2$end, abs(list1$end - list2$end), list1$length, list2$length, abs(list1$length - list2$length))
1448
      ok = length(bar[bar[,3] != 0 | bar[,6] != 0, ]) == 0
1449
      if (!ok) {
1450
        print(bar[bar[,3] != 0 | bar[,6] != 0, ])
1451
      }
1452
      return (ok)
1453
    }
1454
    squeeze_rois = function(list1, list2) {
1455
      rois = apply(t(1:nrow(list1)), 2, function(i){
1456
        roi = list1[i,]
1457
        roi2 = list2[i,]
1458
        roi$begin = max(roi$begin, roi2$begin)
1459
        roi$end = min(roi$end, roi2$end)
1460
        roi$length =  roi$end - roi$begin + 1
1461
        return(roi)
1462
      })
1463
      return(do.call(rbind, rois))
1464
    }
1465
    # foo_orig = compute_inter_all_strain_curs2(config=config)
1466
    # foo = foo_orig
1467
    STOP = FALSE
1468
    nb_round = 0
1469
    while(!STOP) {
1470
      nb_round = nb_round + 1
1471
      print(paste("2-2 round #", nb_round, sep=""))
1472
      fooby = translate_rois(foo, "BY", config=config)
1473
      fooyjm = translate_rois(foo, "YJM", config=config)
1474
      fooyjmby = translate_rois(fooyjm, "BY", config=config)
1475
      if (!is_it_ok(fooby, fooyjmby)) {
1476
        print("case 1")
1477
        foo = squeeze_rois(fooby, fooyjmby)
1478
    		next
1479
      }
1480
      foorm = translate_rois(foo, "RM", config=config)
1481
      foormby = translate_rois(foorm, "BY", config=config)
1482
      if (!is_it_ok(fooby, foormby)) {
1483
        print("case 2")
1484
        foo = squeeze_rois(fooby, foormby)
1485
    		next
1486
      }
1487
      fooyjmrm = translate_rois(fooyjm, "RM", config=config)
1488
      fooyjmrmyjm = translate_rois(fooyjmrm, "YJM", config=config)
1489
      if (!is_it_ok(fooyjm, fooyjmrmyjm)) {
1490
        print("case 3")
1491
        foo = squeeze_rois(fooyjm, fooyjmrmyjm)
1492
        next
1493
      }
1494
      foormyjm = translate_rois(foorm, "YJM", config=config)
1495
      foormyjmrm = translate_rois(foormyjm, "RM", config=config)
1496
      if (!is_it_ok(foorm, foormyjmrm)) {
1497
        print("case 4")
1498
        foo = squeeze_rois(foorm, foormyjmrm)
1499
        next
1500
      }
1501
      foo = translate_rois(foo, "BY", config=config)
1502
      STOP = TRUE
1503
    }
1504
    STOP = FALSE
1505
    nb_round = 0
1506
    while(!STOP) {
1507
      nb_round = nb_round + 1
1508
      print(paste("3-3 round #", nb_round, sep=""))
1509
      fooby = translate_rois(foo, "BY", config=config)
1510
      foobyrm = translate_rois(fooby, "RM", config=config)
1511
      foobyrmyjm = translate_rois(foobyrm, "YJM", config=config)
1512
      foobyrmyjmby = translate_rois(foobyrmyjm, "BY", config=config)
1513
      if (!is_it_ok(fooby, foobyrmyjmby)) {
1514
        print("case 1")
1515
        foo = squeeze_rois(fooby, foobyrmyjmby)
1516
      }
1517
      fooby = translate_rois(foo, "BY", config=config)
1518
      foobyyjm = translate_rois(fooby, "YJM", config=config)
1519
      foobyyjmrm = translate_rois(foobyyjm, "RM", config=config)
1520
      foobyyjmrmby = translate_rois(foobyyjmrm, "BY", config=config)
1521
      if (!is_it_ok(fooby, foobyyjmrmby)) {
1522
        print("case 2")
1523
        foo = squeeze_rois(fooby, foobyyjmrmby)
1524
        next
1525
      }
1526
      foo = translate_rois(foo, "BY", config=config)
1527
      STOP = TRUE
1528
    }
1529
    print("end")
1530
    return(foo)
1531
  }
1532

    
1533
  get_inter_strain_rois = function(strain1, strain2, diff_allowed = 30, min_cur_width = 200, config=NULL) {
1534
    c2c = c2c_extraction(strain1, strain2, config=config)
1535
    # computing diffs
1536
    diff = c2c$V2[-1] - c2c$V3[-length(c2c$V2)]
1537
    diff2 = c2c$V9[-1] - c2c$V10[-length(c2c$V2)]
1538
    # Filtering
1539
  	indexes_stop = which(abs(diff) > diff_allowed | abs(diff2) > diff_allowed)
1540
  	indexes_start = c(1, indexes_stop[-length(indexes_stop)] + rep(1, length(indexes_stop) -1))
1541
    rois = apply(t(1:length(indexes_start)), 2, function(i) {
1542
      if ( i %% 20 == 1) print(paste(i, "/", length(indexes_start)))
1543
      returned_rois = NULL
1544
  		start = indexes_start[i]
1545
  		stop = indexes_stop[i]
1546
  		sub_c2c = c2c[start:stop,]
1547
      check_homogeneity(sub_c2c)
1548
  		if (strain1 == "BY") {
1549
  			chr = ROM2ARAB()[[substr(sub_c2c[1,]$V1,4,10)]]
1550
  		} else {
1551
  			chr = config$FASTA_INDEXES[[strain1]][[sub_c2c[1,]$V1]]
1552
  		}
1553
  		roi = list(chr=chr, begin=min(c(sub_c2c$V2,sub_c2c$V3)), end=max(c(sub_c2c$V2,sub_c2c$V3)), strain_ref=strain1)
1554
  		roi$length = roi$end - roi$begin + 1
1555
  		if (roi$length >= min_cur_width) {
1556
  			lower_bound = roi$begin
1557
  			upper_bound = roi$end
1558
        check = check_overlaping(strain1, strain2, chr, lower_bound, upper_bound, config=config)
1559
        while(!is.null(check)) {
1560
          # print(check)
1561
      		roi1 = roi
1562
          roi1$end = check[[1]]
1563
      		roi1$length = roi1$end - roi1$begin + 1
1564
      		if (roi1$length >= min_cur_width) {
1565
            returned_rois = dfadd(returned_rois,roi1)
1566
          }
1567
          roi$begin = check[[2]]
1568
      		roi$length = roi$end - roi$begin + 1
1569
      		if (roi$length >= min_cur_width) {
1570
      			lower_bound = min(roi$begin, roi$end)
1571
      			upper_bound = max(roi$begin, roi$end)
1572
            check = check_overlaping(strain1, strain2, chr, lower_bound, upper_bound, config=config)
1573
          } else {
1574
            check = NULL
1575
            roi = NULL
1576
          }
1577
        }
1578
        returned_rois = dfadd(returned_rois,roi)
1579
  	  }
1580
    })
1581
    rois = do.call(rbind,rois)
1582
    rois = rois[order(as.numeric(rois$chr), rois$begin), ]
1583
  	return(rois)
1584
  }
1585

    
1586
  translate_rois = function(rois, target_strain, config) {
1587
    tr_rois = apply(t(1:nrow(rois)), 2, function(i){
1588
      roi = rois[i,]
1589
      tr_roi = translate_roi(roi, target_strain, config=config)  
1590
      tmp_begin = min(tr_roi$begin, tr_roi$end)
1591
      tmp_end = max(tr_roi$begin, tr_roi$end)
1592
      tr_roi$begin = tmp_begin
1593
      tr_roi$end = tmp_end
1594
      tr_roi$length =  tr_roi$end - tr_roi$begin + 1
1595
      return(tr_roi)
1596
    })
1597
    tr_rois = do.call(rbind, tr_rois)
1598
    return(tr_rois)
1599
  }
1600

    
1601
  combis = list(c("BY", "RM"), c("BY", "YJM"), c("RM", "YJM"))
1602
  rois = list()
1603
  for (combi in combis) {
1604
    strain1 = combi[1]
1605
    strain2 = combi[2]
1606
    print(paste(strain1, strain2))
1607
    rois_fwd = get_inter_strain_rois(strain1, strain2, min_cur_width = min_cur_width, diff_allowed = diff_allowed, config=config)
1608
    strain1 = combi[2]
1609
    strain2 = combi[1]
1610
    print(paste(strain1, strain2))
1611
    rois_rev = get_inter_strain_rois(strain1, strain2, min_cur_width = min_cur_width, diff_allowed = diff_allowed, config=config)
1612
    tr_rois_rev = translate_rois(rois_rev, combi[1], config)      
1613
    region1 = rois_fwd
1614
    region2 = tr_rois_rev
1615
    rois[[paste(combi[1], combi[2], sep="_")]] = intersect_region(rois_fwd, tr_rois_rev)
1616
  }
1617
  reducted_1_rois = intersect_region(rois[["BY_RM"]], rois[["BY_YJM"]])
1618
  reducted_1_rois = reducted_1_rois[reducted_1_rois$length >= min_cur_width, ]
1619
  tr_reducted_1_rois = translate_rois(reducted_1_rois, "RM", config)  
1620
  reducted_2_rois = intersect_region(tr_reducted_1_rois, rois[["RM_YJM"]])
1621
  reducted_2_rois = reducted_2_rois[reducted_2_rois$length >= min_cur_width, ]
1622
  reducted_rois = translate_rois(reducted_2_rois, "BY", config)  
1623
  reducted_rois = reducted_rois[order(as.numeric(reducted_rois$chr), reducted_rois$begin), ]
1624
  squeezed_rois = test_and_squeeze_rois(reducted_rois, config=config)
1625
  return (squeezed_rois)
1626
}
1627

    
1628
intersect_region = function(# Returns the intersection of 2 list on regions.
1629
### This function...
1630
region1, ##<< Original regions.
1631
region2 ##<< Regions to intersect.
1632
) {
1633
  intersection = apply(t(1:nrow(region1)), 2, function(i) {
1634
    roi1 = region1[i, ]
1635
    sub_regions2 = region2[region2$chr == roi1$chr, ]
1636
    sub_regions2 = sub_regions2[roi1$begin <= sub_regions2$begin & sub_regions2$begin <= roi1$end |
1637
                                roi1$begin <= sub_regions2$end & sub_regions2$end <= roi1$end |
1638
                                sub_regions2$begin < roi1$begin  & roi1$end < sub_regions2$end
1639
                                , ]
1640
    if (nrow(sub_regions2) == 0) {
1641
      print("removing a region")
1642
      return(NULL)
1643
    } else if (nrow(sub_regions2) > 1) {
1644
      print("more than one region in intersect_region")          
1645
      return(do.call(rbind, apply(t(1:nrow(sub_regions2)), 2, function(i) {intersect_region(roi1, sub_regions2[i,])})))
1646
    } else {
1647
      roi2 = sub_regions2[1,]
1648
      if (roi1$begin < roi2$begin) {
1649
        print("not the same begin")
1650
        roi1$begin = roi2$begin
1651
        roi1$length =  roi1$end - roi1$begin + 1
1652
      }
1653
      if (roi1$end > roi2$end) {
1654
        print("not the same end")
1655
        roi1$end = roi2$end
1656
        roi1$length =  roi1$end - roi1$begin + 1
1657
      }
1658
      return(roi1)
1659
    }         
1660
  })
1661
  return(do.call(rbind,intersection))
1662
}
1663

    
1664
build_replicates = structure(function(# Stage replicates data
1665
### This function loads in memory data corresponding to the given experiments.
1666
expe, ##<< a list of vector corresponding to vector of replicates.
1667
roi, ##<< the region that we are interested in.
1668
only_fetch=FALSE, ##<< filter or not inputs.
1669
get_genome=FALSE,##<< Load or not corresponding genome.
1670
all_samples, ##<< Global list of samples.
1671
config=NULL ##<< GLOBAL config variable.
1672
) {
1673
  build_samples = function(samples_ids, roi, only_fetch=FALSE, get_genome=TRUE, get_ouputs=TRUE, all_samples) {
1674
  	samples=list()
1675
  	for (i in samples_ids) {
1676
  		sample = as.list(all_samples[all_samples$id==i,])
1677
      sample$orig_roi = roi
1678
      sample$roi = translate_roi(roi, sample$strain, config = config)
1679
  		if (get_genome) {
1680
  			# Get Genome
1681
  			fasta_ref_filename = config$FASTA_REFERENCE_GENOME_FILES[[sample$strain]]
1682
  			sample$roi$genome = get_content(fasta_ref_filename, "fasta")[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end]
1683
  		}
1684
  		# Get inputs
1685
  		sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep="")
1686
  		sample$inputs = get_content(sample_inputs_filename, "table", stringsAsFactors=FALSE)
1687
  		sample$total_reads = sum(sample$inputs[,4])
1688
  		if (!only_fetch) {
1689
  		  sample$inputs = filter_tf_inputs(sample$inputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300, filter_for_coverage=TRUE)
1690
  	  }
1691
  	  # Get TF outputs for Mnase_Seq samples
1692
  		if (sample$marker == "Mnase_Seq" & get_ouputs) {
1693
  			sample_outputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep="")
1694
  			sample$outputs = get_content(sample_outputs_filename, "table", header=TRUE, sep="\t")
1695
  			if (!only_fetch) {
1696
  	  		sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr,  min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
1697
    		}
1698
  		}
1699
  		samples[[length(samples) + 1]] = sample
1700
  	}
1701
  	return(samples)
1702
  }
1703
	replicates = list()
1704
	for(samples_ids in expe) {
1705
		samples = build_samples(samples_ids, roi, only_fetch=only_fetch, get_genome=get_genome, all_samples=all_samples)
1706
		replicates[[length(replicates) + 1]] = samples
1707
	}
1708
	return(replicates)
1709
  }, ex = function() {
1710
    # library(rjson)
1711
    # library(nucleominer)
1712
    #
1713
    # # Read config file
1714
    # json_conf_file = "nucleo_miner_config.json"
1715
    # config = fromJSON(paste(readLines(json_conf_file), collapse=""))
1716
    # # Read sample file
1717
    # all_samples = get_content(config$CSV_SAMPLE_FILE, "cvs", sep=";", head=TRUE, stringsAsFactors=FALSE)
1718
    # # here are the sample ids in a list
1719
    # expes = list(c(1))
1720
    # # here is the region that we wnt to see the coverage
1721
    # cur = list(chr="8", begin=472000, end=474000, strain_ref="BY")
1722
    # # it displays the corverage
1723
    # replicates = build_replicates(expes, cur, all_samples=all_samples, config=config)
1724
    # out = watch_samples(replicates, config$READ_LENGTH,
1725
    #       plot_coverage = TRUE,
1726
    #       plot_squared_reads = FALSE,
1727
    #       plot_ref_genome = FALSE,
1728
    #       plot_arrow_raw_reads = FALSE,
1729
    #       plot_arrow_nuc_reads = FALSE,
1730
    #       plot_gaussian_reads = FALSE,
1731
    #       plot_gaussian_unified_reads = FALSE,
1732
    #       plot_ellipse_nucs = FALSE,
1733
    #       plot_wp_nucs = FALSE,
1734
    #       plot_wp_nuc_model = FALSE,
1735
    #       plot_common_nucs = FALSE,
1736
    #       height = 50)
1737
  })
1738

    
1739

    
1740

    
1741

    
1742

    
1743

    
1744

    
1745

    
1746

    
1747

    
1748

    
1749

    
1750

    
1751

    
1752

    
1753

    
1754

    
1755

    
1756

    
1757

    
1758

    
1759

    
1760

    
1761

    
1762

    
1763

    
1764

    
1765

    
1766

    
1767

    
1768

    
1769

    
1770

    
1771

    
1772

    
1773

    
1774

    
1775

    
1776

    
1777

    
1778

    
1779

    
1780

    
1781

    
1782

    
1783

    
1784

    
1785

    
1786

    
1787

    
1788

    
1789

    
1790

    
1791

    
1792

    
1793

    
1794

    
1795

    
1796

    
1797

    
1798

    
1799

    
1800

    
1801

    
1802

    
1803

    
1804

    
1805

    
1806

    
1807

    
1808

    
1809

    
1810

    
1811

    
1812

    
1813

    
1814

    
1815

    
1816

    
1817

    
1818

    
1819

    
1820

    
1821
watch_samples = function(# Watching analysis of samples
1822
### This function allows to view analysis for a particuler region of the genome.
1823
replicates, ##<< replicates under the form...
1824
read_length, ##<< length of the reads
1825
plot_ref_genome = TRUE, ##<< Plot (or not) reference genome.
1826
plot_arrow_raw_reads = TRUE,  ##<< Plot (or not) arrows for raw reads.
1827
plot_arrow_nuc_reads = TRUE,  ##<< Plot (or not) arrows for reads aasiocied to a nucleosome.
1828
plot_squared_reads = TRUE,  ##<< Plot (or not) reads in the square fashion.
1829
plot_coverage = FALSE,  ##<< Plot (or not) reads in the covergae fashion. fashion.
1830
plot_gaussian_reads = TRUE,  ##<< Plot (or not) gaussian model of a F anf R reads.
1831
plot_gaussian_unified_reads = TRUE,  ##<< Plot (or not) gaussian model of a nuc.
1832
plot_ellipse_nucs = TRUE,  ##<< Plot (or not) ellipse for a nuc.
1833
change_col = TRUE, ##<< Change the color of each nucleosome.
1834
plot_wp_nucs = TRUE,  ##<< Plot (or not) cluster of nucs
1835
plot_fuzzy_nucs = TRUE,  ##<< Plot (or not) cluster of fuzzy
1836
plot_wp_nuc_model = TRUE,  ##<< Plot (or not) gaussian model for a cluster of nucs
1837
plot_common_nucs = FALSE,  ##<< Plot (or not) aligned reads.
1838
plot_common_unrs = FALSE,  ##<< Plot (or not) unaligned nucleosomal refgions (UNRs).
1839
plot_wp_nucs_4_nonmnase = FALSE,  ##<< Plot (or not) clusters for non inputs samples.
1840
plot_chain = FALSE,  ##<< Plot (or not) clusterised nuceosomes between mnase samples.
1841
aggregated_intra_strain_nucs = NULL, ##<< list of aggregated intra strain nucs. If NULL, it will be computed.
1842
aligned_inter_strain_nucs = NULL, ##<< list of aligned inter strain nucs. If NULL, it will be computed.
1843
height = 10, ##<< Number of reads in per million read for each sample, graphical parametre for the y axis.
1844
config=NULL ##<< GLOBAL config variable
1845
){
1846
  returned_list = list()
1847
  # Computing global display parameters
1848
  if (replicates[[1]][[1]]$roi[["begin"]] < replicates[[1]][[1]]$roi[["end"]]) {
1849
	  x_min_glo = replicates[[1]][[1]]$roi[["begin"]]
1850
	  x_max_glo = replicates[[1]][[1]]$roi[["end"]]
1851
  } else {
1852
	  x_min_glo = - replicates[[1]][[1]]$roi[["begin"]]
1853
	  x_max_glo = - replicates[[1]][[1]]$roi[["end"]]
1854
  }
1855
	base_glo = 0
1856
	nb_rank_glo = 0
1857
  for (samples in replicates) {
1858
  	nb_rank_glo = nb_rank_glo + length(samples)
1859
  }
1860
	ylim_glo = c(base_glo, base_glo + height * nb_rank_glo)
1861
	y_min_glo = min(ylim_glo)
1862
	y_max_glo = max(ylim_glo)
1863
  delta_y_glo = y_max_glo - y_min_glo
1864
  # Plot main frame
1865
  plot(c(x_min_glo,x_max_glo), c(0,0), ylim=ylim_glo, col=0, yaxt="n", ylab="#reads (per million reads)", xlab=paste("Ref strain:", replicates[[1]][[1]]$strain, "chr: ", replicates[[1]][[1]]$roi$chr), main="NucleoMiner2" )
1866
  axis(2, at=0:(nb_rank_glo*2) * delta_y_glo / (nb_rank_glo*2), labels=c(rep(c(height/2,0),nb_rank_glo),height/2))
1867
  # Go
1868
	replicates_wp_nucs = list()
1869
  wp_maps = list()
1870
  fuzzy_maps = list()
1871
  for (replicate_rank in 1:length(replicates)) {
1872
		# Computing replicate parameters
1873
		nb_rank = length(samples)
1874
		base = (replicate_rank-1) * height * nb_rank
1875
		ylim = c(base, base + height * nb_rank)
1876
		y_min = min(ylim)
1877
		y_max = max(ylim)
1878
	  delta_y = y_max - y_min
1879
		samples = replicates[[replicate_rank]]
1880
		for (sample_rank in 1:length(samples)) {
1881
			# computing sample parameters
1882
			sample = samples[[sample_rank]]
1883
			y_lev = y_min + (sample_rank - 0.5) * delta_y/nb_rank
1884
			text(x_min_glo, y_lev + height/2 - delta_y_glo/100, labels=paste("(",sample$id,") ",sample$strain, " ", sample$marker, sep=""))
1885

    
1886
		  if (sample$roi[["begin"]] < sample$roi[["end"]]) {
1887
			  x_min = sample$roi[["begin"]]
1888
			  x_max = sample$roi[["end"]]
1889
		  } else {
1890
			  x_min = - sample$roi[["begin"]]
1891
			  x_max = - sample$roi[["end"]]
1892
		  }
1893
			shift = x_min_glo - x_min
1894
	    # Plot Genome seq
1895
			if (plot_ref_genome) {
1896
		  	text(1:length(sample$roi$genome) + x_min - 1 + shift, rep(y_lev - height/2, length(sample$roi$genome)), labels=toupper(sample$roi$genome), cex=dev.size()[1]*9/(x_max-x_min), family="Courier")
1897
		  }
1898
			# Plot reads
1899
			reads = sample$inputs
1900
			signs = sign_from_strand(reads[,3])
1901
			if (plot_arrow_raw_reads) {
1902
				arrows(sign(x_min) * reads[,2] + shift, sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads + y_lev, sign(x_min) * (reads[,2] + signs * read_length) + shift, sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads + y_lev,
1903
				col=1, length=0.15/nb_rank)
1904
			}
1905
	    if (plot_squared_reads) {
1906
        # require(plotrix)
1907
				rect(sign(x_min) * reads[,2] + shift, rep(y_lev,length(reads[,1])), sign(x_min) * (reads[,2] + signs * read_length) + shift,  y_lev + sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads, col=adjustcolor(1, alpha.f = 0.1),border=0)
1908
			}
1909
	    if (plot_coverage) {
1910
        if (length(reads[,1]) != 0) {
1911
          step_h = sign(x_min) * signs * reads[,4]
1912
          step_b = sign(x_min) * reads[,2] + shift
1913
          step_e = sign(x_min) * (reads[,2] + signs * 150) + shift
1914
          steps_x = min(step_b, step_e):max(step_b, step_e)
1915
          steps_y = rep(0, length(steps_x))
1916
          for (i in 1:length(step_h)) {
1917
            steps_y[which(steps_x==min(step_b[i], step_e[i]))] =  steps_y[which(steps_x==min(step_b[i], step_e[i]))] + abs(step_h[i])
1918
            steps_y[which(steps_x==max(step_b[i], step_e[i]))] =  steps_y[which(steps_x==max(step_b[i], step_e[i]))] - abs(step_h[i])
1919
          }
1920
          tmp_index = which(steps_y != 0)
1921
          steps_x = steps_x[tmp_index]
1922
          steps_y = steps_y[tmp_index]
1923
          tmp_current_level = 0
1924
          for (i in 1:length(steps_y)) {
1925
            steps_y[i] = tmp_current_level + steps_y[i]
1926
            tmp_current_level = steps_y[i]
1927
          }
1928
          steps_y = c(0, steps_y)
1929
          steps_y = steps_y * 1000000/sample$total_reads
1930
        } else {
1931
          steps_y = c(0, 0, 0)
1932
          steps_x = c(x_min, x_max)
1933
        }
1934
        # print(steps_x)
1935
        # print(steps_y)
1936
        lines(stepfun(steps_x, steps_y + y_lev), pch="")
1937
        abline(y_lev,0)
1938
        returned_list[[paste("cov", sample$id, sep="_")]] = stepfun(steps_x, steps_y)
1939
			}
1940
			# Plot nucs
1941
	    if (sample$marker == "Mnase_Seq" & (plot_squared_reads | plot_gaussian_reads | plot_gaussian_unified_reads | plot_arrow_nuc_reads)) {
1942
				nucs = sample$outputs
1943
				if (length(nucs$center) > 0) {
1944
					col = 1
1945
		      for (i in 1:length(nucs$center)) {
1946
            if (change_col) {
1947
  						col = col + 1
1948
            }
1949
		        nuc = nucs[i,]
1950
						involved_reads = filter_tf_inputs(reads, sample$roi$chr, nuc$lower_bound, nuc$upper_bound, nuc_width = nuc$width)
1951
				  	involved_signs = apply(t(involved_reads[,3]), 2, function(strand) {	if (strand == "F") return(1) else return(-1)})
1952
						total_involved_reads = sum(involved_reads[,4])
1953
						if (plot_arrow_nuc_reads ) {
1954
							arrows(sign(x_min) * involved_reads[,2] + shift, sign(x_min) * involved_signs * involved_reads[,4] * 1000000/sample$total_reads + y_lev, sign(x_min) * (involved_reads[,2] + involved_signs * read_length) + shift, sign(x_min) * involved_signs * involved_reads[,4] * 1000000/sample$total_reads + y_lev,
1955
							col=col, length=0.15/nb_rank)
1956
						}
1957
	          if (plot_gaussian_reads | plot_gaussian_unified_reads) {
1958
  						flatted_reads = flat_reads(involved_reads, nuc$width)
1959
	  					delta_x = (nuc$center - nuc$width):(nuc$center + nuc$width)
1960
		  			}
1961
	          if (plot_gaussian_reads ) {
1962
							flatted_reads = flat_reads(involved_reads, nuc$width)
1963
							delta_x = (nuc$center - nuc$width):(nuc$center + nuc$width)
1964
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[1]]), sd(flatted_reads[[1]])) * length(flatted_reads[[1]]) * sign(x_min) * height/5 + y_lev, col=col)
1965
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[2]]), sd(flatted_reads[[2]])) * length(flatted_reads[[2]]) * -1 * sign(x_min) * height/5 + y_lev, col=col)
1966
	          }
1967
	          if (plot_gaussian_unified_reads ) {
1968
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[3]]), sd(flatted_reads[[3]])) * length(flatted_reads[[3]]) * height/5 + y_lev, col=col, lty=2)
1969
	          }
1970
	          if (plot_ellipse_nucs) {
1971
				      # require(plotrix)
1972
	  	 				draw.ellipse(sign(x_min) * nuc$center + shift, y_lev, nuc$width/2, total_involved_reads/nuc$width * height/5, border=col)
1973
						}
1974
		      }
1975
		    } else {
1976
		      print("WARNING! No nucs to print.")
1977
		    }
1978
			}
1979
	  }
1980

    
1981
	  # Plot wp nucs
1982
		if ((plot_wp_nucs_4_nonmnase | sample$marker == "Mnase_Seq") & (plot_wp_nucs | plot_fuzzy_nucs | plot_common_nucs | plot_chain)) {
1983
			if (samples[[1]]$marker == "Mnase_Seq") {
1984
				if (is.null(aggregated_intra_strain_nucs)) {
1985
	  			wp_nucs = aggregate_intra_strain_nucs(samples)[[1]]
1986
				} else {
1987
					wp_nucs = aggregated_intra_strain_nucs[[replicate_rank]]
1988
				}
1989
		  } else {
1990
  			wp_nucs = replicates_wp_nucs[[replicate_rank-2]]
1991
		  }
1992
      if (plot_chain) {
1993
        tf_nucs = lapply(wp_nucs, function(nuc) {
1994
          bar = apply(t(nuc$nucs), 2, function(tmp_nuc){
1995
            tmp_nuc = tmp_nuc[[1]]
1996
            tmp_nuc$inputs = NULL
1997
            tmp_nuc$original_reads = NULL
1998
            tmp_nuc$wp = nuc$wp
1999
            # print(tmp_nuc)
2000
            return(tmp_nuc)
2001
          })
2002
          return(do.call(rbind, bar))
2003
        })
2004
        tf_nucs = data.frame(do.call(rbind, tf_nucs))
2005
        tmp_x = (unlist(tf_nucs$lower_bound) + unlist(tf_nucs$upper_bound)) / 2
2006
        tmp_y =  y_min + (unlist(tf_nucs$track) - 0.5) * delta_y/nb_rank
2007
        tmp_y_prev = tmp_y[-length(tmp_y)]
2008
        tmp_y_next = tmp_y[-1]
2009
        tmp_y_inter = (tmp_y_prev + tmp_y_next) / 2
2010
        tmp_track = unlist(tf_nucs$track)
2011
        tmp_track_prev = tmp_track[-length(tmp_track)]
2012
        tmp_track_next = tmp_track[-1]
2013
        tmp_track_inter = signif(tmp_track_prev - tmp_track_next) * (abs(tmp_track_prev - tmp_track_next) > 1) * 25
2014
        tmp_x_prev = tmp_x[-length(tmp_x)]
2015
        tmp_x_next = tmp_x[-1]
2016
        need_shift = apply(t(tmp_x_next - tmp_x_prev), 2, function(delta){ delta < 50})
2017
        tmp_x_inter = (tmp_x_prev + tmp_x_next) / 2 + tmp_track_inter * need_shift
2018
        tmp_lod_inter =signif(unlist(tf_nucs$lod_score)[-1], 2)
2019
        new_tmp_x = c()
2020
        new_tmp_y = c()
2021
        index_odd = 1:length(tmp_x) * 2 - 1
2022
        index_even = (1:(length(tmp_x) - 1)) * 2
2023
        new_tmp_x[index_odd] = tmp_x
2024
        new_tmp_y[index_odd] = tmp_y
2025
        new_tmp_x[index_even] = tmp_x_inter
2026
        new_tmp_y[index_even] = tmp_y_inter
2027
        lines(new_tmp_x , new_tmp_y, lw=2)
2028
        points(tmp_x, tmp_y, cex=4, pch=16, col="white")
2029
        points(tmp_x, tmp_y, cex=4, lw=2)
2030
        text(tmp_x, tmp_y, 1:nrow(tf_nucs))
2031
        text(tmp_x_inter, tmp_y_inter, tmp_lod_inter, srt=90, cex=0.9, bg = "yellow")#, col=(tmp_lod_inter < 20) + 2)
2032
      }
2033

    
2034
      if (plot_wp_nucs | plot_fuzzy_nucs | plot_common_nucs ) {
2035
    		replicates_wp_nucs[[replicate_rank]] = wp_nucs
2036
        strain = samples[[1]]$strain
2037
        wp_maps[[strain]] = flat_aggregated_intra_strain_nucs(wp_nucs, "foo")
2038
        fuzzy_maps[[strain]] = get_intra_strain_fuzzy(wp_maps[[strain]], as.list(samples[[1]]$roi), samples[[1]]$strain, config=config)
2039

    
2040
        if (plot_fuzzy_nucs) {
2041
          fuzzy_map = fuzzy_maps[[strain]]
2042
          rect(sign(x_min) * fuzzy_map$lower_bound + shift, y_min, sign(x_min) * fuzzy_map$upper_bound + shift, y_max, col=adjustcolor(3, alpha.f = 0.1), border=1)        
2043
        } 
2044

    
2045
        if (plot_wp_nucs) {
2046
    			for (wp_nuc in wp_nucs) {
2047
    				if (wp_nuc$wp){
2048
    					rect(sign(x_min) * wp_nuc$lower_bound + shift, y_min, sign(x_min) * wp_nuc$upper_bound + shift, y_max, col=adjustcolor(2, alpha.f = 0.1), border=1)
2049
    					if (plot_wp_nuc_model) {
2050
      					all_original_reads = c()
2051
      					for(initial_nuc in wp_nuc$nucs) {
2052
      						all_original_reads = c(all_original_reads, initial_nuc$original_reads)
2053
      					}
2054
      					delta_x = wp_nuc$lower_bound:wp_nuc$upper_bound
2055
    					  lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(all_original_reads), sd(all_original_reads)) * length(all_original_reads) * height/5 + y_min, col=1)
2056
    				  }
2057
  				  }
2058
  				}
2059
  			}
2060
      }
2061
		}
2062
	}
2063

    
2064
	if (plot_common_nucs) {
2065
    if (is.null(aligned_inter_strain_nucs)) {
2066
      aligned_inter_strain_nucs = align_inter_strain_nucs(replicates, replicates_wp_nucs[[1]], replicates_wp_nucs[[2]], config=config)[[1]]
2067
      aligned_inter_strain_nucs$roi_index = "foo" 
2068
    }
2069

    
2070
    #Plot common wp nucs
2071
    mid_y = shift = x_min = x_max = nb_rank = base = ylim = ymin = y_max = delta_y = list()
2072
    for (replicate_rank in 1:length(replicates)) {
2073
      nb_rank[[replicate_rank]] = length(samples)
2074
      base[[replicate_rank]] = (replicate_rank-1) * height * nb_rank[[replicate_rank]]
2075
      ylim[[replicate_rank]] = c(base[[replicate_rank]], base[[replicate_rank]] + height * nb_rank[[replicate_rank]])
2076
      y_min[[replicate_rank]] = min(ylim[[replicate_rank]])
2077
      y_max[[replicate_rank]] = max(ylim[[replicate_rank]])
2078
      delta_y[[replicate_rank]] = y_max[[replicate_rank]] - y_min[[replicate_rank]]
2079
      mid_y[[replicate_rank]] = (y_max[[replicate_rank]] + y_min[[replicate_rank]]) / 2
2080

    
2081
      samples = replicates[[replicate_rank]]
2082
      for (sample_rank in 1:length(samples)) {
2083
        sample = samples[[sample_rank]]
2084
        y_lev = y_min[[replicate_rank]] + (sample_rank - 0.5) * delta_y[[replicate_rank]]/nb_rank[[replicate_rank]]
2085
        if (sample$roi[["begin"]] < sample$roi[["end"]]) {
2086
          x_min[[replicate_rank]] = sample$roi[["begin"]]
2087
          x_max[[replicate_rank]] = sample$roi[["end"]]
2088
        } else {
2089
          x_min[[replicate_rank]] = - sample$roi[["begin"]]
2090
          x_max[[replicate_rank]] = - sample$roi[["end"]]
2091
        }
2092
        shift[[replicate_rank]] = x_min[[1]] - x_min[[replicate_rank]]
2093
      }
2094
    }
2095
    for (inter_strain_nuc_index in 1:length(aligned_inter_strain_nucs[,1])) {
2096
      inter_strain_nuc = aligned_inter_strain_nucs[inter_strain_nuc_index,]
2097
      tmp_xs = tmp_ys = c()
2098
      for (replicate_rank in 1:length(replicates)) {
2099
        samples = replicates[[replicate_rank]]
2100
        strain = samples[[1]]$strain
2101
        tmp_xs = c(tmp_xs, sign(x_min[[replicate_rank]]) * (inter_strain_nuc[[paste("lower_bound_",strain,sep="")]] + inter_strain_nuc[[paste("upper_bound_",strain,sep="")]])/2 + shift[[replicate_rank]])
2102
        tmp_ys = c(tmp_ys, mid_y[[replicate_rank]])
2103
      }
2104
      lines(tmp_xs, tmp_ys, col=2, type="b", lwd=dev.size()[1]*100/(x_max[[1]]-x_min[[1]]), cex=dev.size()[1]*200/(x_max[[1]]-x_min[[1]]), pch=19)
2105
    }
2106
    
2107
    if (plot_common_unrs) {
2108
      combi = c(replicates[[1]][[1]]$strain, replicates[[2]][[1]]$strain)
2109
      roi = as.list(samples[[1]]$roi)
2110
      roi_index = "foo"
2111
      common_nuc_results = list()
2112
      common_nuc_results[[paste(combi[1], combi[2], sep="_")]] = aligned_inter_strain_nucs
2113
      unrs = get_unrs(combi, roi, roi_index, wp_maps, fuzzy_maps, common_nuc_results, config = config) 
2114
      rect(sign(x_min[[1]]) * unrs$lower_bound + shift[[1]], y_min[[1]], sign(x_min[[1]]) * unrs$upper_bound + shift[[1]], y_max[[2]], col=adjustcolor(4, alpha.f = 0.3), border=1)        
2115
    }
2116

    
2117
	}
2118
  return(returned_list)
2119
}
2120

    
2121

    
2122

    
2123
get_intra_strain_fuzzy = function(# Compute the fuzzy list for a given strain.
2124
### This function grabs the nucleosomes detxted by template_filter that have been rejected bt aggregate_intra_strain_nucs as well positions.
2125
wp_map, ##<< Well positionned nucleosomes map.
2126
roi, ##<< The region of interest.
2127
strain, ##<< The strain we want to extracvt the fuzzy map.
2128
config=NULL ##<< GLOBAL config variable.
2129
) {
2130
  fuzzy_map = wp_map[wp_map$wp==0, ]
2131
  if (nrow(fuzzy_map) > 0) {
2132
    fuzzy_map = substract_region(fuzzy_map, wp_map[wp_map$wp==1,])
2133
    if (!is.null(fuzzy_map)) {
2134
      fuzzy_map = union_regions(fuzzy_map)
2135
      fuzzy_map = crop_fuzzy(fuzzy_map, roi, strain, config)
2136
    }
2137
  }
2138
  return(fuzzy_map)
2139
}
2140

    
2141

    
2142

    
2143

    
2144

    
2145
get_unrs = function(# Compute the unaligned nucleosomal regions (UNRs).
2146
### This function aggregate non common wp nucs for each strain and substract common wp nucs. It does not take care about the size of the resulting UNR. It will be take into account in the count read part og the pipeline.
2147
combi, ##<< The strain combination to consider.
2148
roi, ##<< The region of interest.
2149
roi_index, ##<< The region of interest index.
2150
wp_maps, ##<< Well positionned nucleosomes maps.
2151
fuzzy_maps, ##<< Fuzzy nucleosomes maps.
2152
common_nuc_results, ##<< Common wp nuc maps
2153
config=NULL ##<< GLOBAL config variable
2154
) {
2155
  # print(roi_index)
2156

    
2157
  tmp_combi_key = paste(combi[1], combi[2], sep="_")
2158
  tmp_common_nucs = common_nuc_results[[tmp_combi_key]]
2159
  tmp_common_nucs = tmp_common_nucs[tmp_common_nucs$roi_index == roi_index, ]
2160

    
2161
  # print(paste("Dealing with unr from", combi[1]))
2162
  tmp_fuzzy = fuzzy_maps[[combi[1]]]
2163
  tmp_fuzzy = tmp_fuzzy[tmp_fuzzy$roi_index == roi_index, ]
2164
  tmp_wp = wp_maps[[combi[1]]]
2165
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2166
  tmp_wp = tmp_wp[tmp_wp$roi_index == roi_index, ]
2167
  # Let's go!
2168
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2169
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[1], sep="_")]]) {
2170
      return(NULL)
2171
    } else {
2172
      return (index_nuc)
2173
    }
2174
  }))
2175
  tmp_unaligned_wp = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2176
  tmp_unr = rbind(tmp_fuzzy,tmp_unaligned_wp[,1:4])
2177
  if (length(tmp_unr) != 0) {
2178
    tmp_unr = union_regions(tmp_unr)
2179
  }
2180
  tmp_unr_nucs_1 = tmp_unr
2181
  if (length(tmp_unr_nucs_1[,1]) == 0) {return(NULL)}
2182
  agg_unr_1 = tmp_unr_nucs_1
2183

    
2184
  # print(paste("Dealing with unr from ", combi[2]))
2185
  tmp_fuzzy = fuzzy_maps[[combi[2]]]
2186
  tmp_fuzzy = tmp_fuzzy[tmp_fuzzy$roi_index == roi_index, ]
2187
  tmp_wp = wp_maps[[combi[2]]]
2188
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2189
  tmp_wp = tmp_wp[tmp_wp$roi_index == roi_index, ]
2190
  # Let's go!
2191
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2192
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[2], sep="_")]]) {
2193
      return(NULL)
2194
    } else {
2195
      return (index_nuc)
2196
    }
2197
  }))
2198
  tmp_unaligned_wp = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2199
  tmp_unr = rbind(tmp_fuzzy,tmp_unaligned_wp[,1:4])
2200
  if (length(tmp_unr) != 0) {
2201
    tmp_unr = union_regions(tmp_unr)
2202
  }
2203
  tmp_unr_nucs_2 = tmp_unr
2204
  if (length(tmp_unr_nucs_2[,1]) == 0) {return(NULL)}
2205
  agg_unr_2 = crop_fuzzy(tmp_unr_nucs_2, roi, combi[2], config)
2206
  tr_agg_unr_2 = translate_regions(agg_unr_2, combi, roi_index, roi=roi, config=config)
2207
  tr_agg_unr_2 = union_regions(tr_agg_unr_2)
2208

    
2209
  # print("Dealing with unr from both...")
2210
  all_unr = union_regions(rbind(agg_unr_1, tr_agg_unr_2))
2211

    
2212
  # print(paste("Dealing with wp from", combi[1]))
2213
  tmp_wp = wp_maps[[combi[1]]]
2214
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2215
  tmp_wp = tmp_wp[tmp_wp$roi_index == roi_index, ]
2216
  # Let's go!
2217
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2218
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[1], sep="_")]]) {
2219
      return (index_nuc)
2220
    } else {
2221
      return(NULL)
2222
    }
2223
  }))
2224
  wp_nucs_1 = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2225
  
2226
  # print(paste("Dealing with wp from", combi[2]))
2227
  tmp_wp = wp_maps[[combi[2]]]
2228
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2229
  tmp_wp = tmp_wp[tmp_wp$roi_index == roi_index, ]
2230
  # Let's go!
2231
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2232
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[2], sep="_")]]) {
2233
      return (index_nuc)
2234
    } else {
2235
      return(NULL)
2236
    }
2237
  }))
2238
  wp_nucs_2 = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2239
  wp_nucs_2 = crop_fuzzy(wp_nucs_2, roi, combi[2], config)
2240
  if (nrow(wp_nucs_2) == 0) {
2241
    tr_wp_nucs_2 = wp_nucs_2
2242
  } else {
2243
    tr_wp_nucs_2 = translate_regions(wp_nucs_2, combi, roi_index, roi=roi, config=config)      
2244
  }
2245

    
2246
  # print("Dealing with wp from both...")
2247
  all_wp = union_regions(rbind(wp_nucs_1[,1:4], tr_wp_nucs_2))
2248

    
2249
  # print("Dealing with unr and wp...")
2250
  non_inter_unr = substract_region(all_unr, all_wp)
2251
  non_inter_unr = crop_fuzzy(non_inter_unr, roi, combi[1], config)
2252
  if (is.null(non_inter_unr)) { return(NULL) }
2253
  non_inter_unr$len = non_inter_unr$upper_bound - non_inter_unr$lower_bound
2254
  # non_inter_fuzzy = non_inter_fuzzy[non_inter_fuzzy$len >= min_fuzz_width,]
2255

    
2256
  non_inter_unr$index_nuc = 1:length(non_inter_unr[,1])
2257
  return (non_inter_unr)
2258
}
2259

    
2260

    
2261