Statistiques
| Branche: | Révision :

root / src / R / nucleominer.R @ ec2936ea

Historique | Voir | Annoter | Télécharger (86,57 ko)

1
# get_content = function(# Get content from cached
2
# ### Acces to the cached content of a file via the global variable NM_CACHE. Load content in NM_CACHE if needed.
3
# obj, ##<< The object that we want its content.
4
# ... ##<< Parameters that will be passed to my_read
5
# ) {
6
#   UseMethod("get_content", obj)
7
#   ### Returns the cached content of the current object.
8
# }
9
#
10
# get_content.default = structure( function(# Get content from cached
11
# ### Acces to the cached content of a file via the global variable NM_CACHE. Load content in NM_CACHE if needed.
12
# obj, ##<< The object that we want its content.
13
# ... ##<< Parameters that will be passed to my_read
14
# ) {
15
#   if (inherits(try(NM_CACHE,TRUE), "try-error") || is.null(NM_CACHE)) {
16
#     NM_CACHE <<- list()
17
#   }
18
#   if (is.null(NM_CACHE[[obj$filename]])) {
19
#     print(paste("Loading file ",obj$filename, sep=""))
20
#     tmp_content = my_read(obj, ...)
21
#     print("affect it...")
22
#     NM_CACHE[[obj$filename]] <<- tmp_content
23
#     print("done.")
24
#   }
25
#   return(NM_CACHE[[obj$filename]])
26
# ### Returns the cached content of the current object.
27
# }, ex=function(){
28
#   # Create a dataframe
29
#   df = NULL
30
#   df = dfadd(df, list(key1 = "value1", key2 = "value2"))
31
#   df = dfadd(df, list(key1 = "value1'", key2 = "value3'"))
32
#   # Dump it into tmp file
33
#   write.table(df, file="/tmp/tmp.dump.table.tmp")
34
#   # Load it into cache using feature.
35
#   cached_content_object  = list(filename="/tmp/tmp.dump.table.tmp")
36
#   class(cached_content_object) = "table"
37
#   # First time it will be load into cache
38
#   print(get_content(cached_content_object))
39
#   # Second time not
40
#   print(get_content(cached_content_object))
41
#
42
# })
43
#
44
# my_read = function(
45
# ### Abstract my_read function.
46
#   obj, ...) {
47
#   UseMethod("my_read", obj)
48
# }
49
#
50
# my_read.default = function(
51
# ### Default my_read function.
52
#   obj, ...){
53
#   stop(paste("ERROR, my_read is not defined for any Objects (file ", obj$filename," )", sep=""))
54
# }
55
#
56
# my_read.fasta = function(
57
# ### my_read function for fasta files.
58
#   obj, ...){
59
#   # require(seqinr)
60
#   return(read.fasta(obj$filename, ...))
61
# }
62
#
63
# my_read.table = function(
64
# ### my_read function for table files.
65
#   obj, ...){
66
#   if (rev(unlist(strsplit(obj$filename, ".", fixed=TRUE)))[1] == "gz") {
67
#     return(read.table(file=gzfile(obj$filename), ...))
68
#   } else {
69
#     return(read.table(file=obj$filename, ...))
70
#   }
71
# }
72
#
73
# my_read.cvs = function(
74
# ### my_read function for cvs files.
75
#   obj, ...){
76
#   return(read.csv(file=obj$filename, ...))
77
# }
78

    
79

    
80

    
81

    
82

    
83

    
84

    
85

    
86

    
87
FDR = structure(function#  False Discovery Rate
88
### From a vector x of independent p-values, extract the cutoff corresponding to the specified FDR. See Benjamini & Hochberg 1995 paper
89
##author<< Gael Yvert,
90
(
91
x, ##<< A vector x of independent p-values.
92
FDR ##<< The specified FDR.
93
) {
94
  x <- sort(na.omit(x))
95
  N = length(x)
96
  i = 1;
97
  while(N*x[i]/i < FDR & i <= N) i = i + 1; # we search for the highest i where Nrandom / Nobserved < FDR
98
  if (i == 1)
99
    return (NA)
100
  else
101
    return( x[i-1] )
102
### Return the the corresponding cutoff.
103
}, ex=function(){
104
  print("example")
105
})
106

    
107

    
108

    
109
llr_score_nvecs = structure(function # Likelihood ratio
110
### Compute the log likelihood ratio of two or more set of value.
111
(
112
  xs ##<< list of vectors.
113
) {
114
  l = length(xs)
115
  if (l < 1) {
116
    return(NA)
117
  }
118
  if (l == 1) {
119
    return(1)
120
  }
121
  sumllX = 0
122
  for (i in 1:l) {
123
    x = xs[[i]]
124
  	if (length(x) <= 1) {
125
  		return(NA)
126
  	}
127
    meanX = mean(x)
128
    sdX = sd(x)
129
    llX = sum(log(dnorm(x,mean=meanX,sd=sdX)))
130
    sumllX = sumllX + llX
131
  }
132
  meanXglo = mean(unlist(xs))
133
  sdXglo = sd(unlist(xs))
134
  llXYZ = sum(log(dnorm(unlist(xs),mean=meanXglo,sd=sdXglo)))
135
  ratio = sumllX - llXYZ
136
  return(ratio)
137
### Returns the log likelihood ratio.
138
}, ex=function(){
139
  # LOD score for 2 set of values
140
  mean1=5; sd1=2; card2 = 250
141
  mean2=6; sd2=3; card1 = 200
142
  x1 = rnorm(card1, mean1, sd1)
143
  x2 = rnorm(card2, mean2, sd2)
144
  min = floor(min(c(x1,x2)))
145
  max = ceiling(max(c(x1,x2)))
146
  hist(c(x1,x2), xlim=c(min, max), breaks=min:max)
147
  lines(min:max,dnorm(min:max,mean1,sd1)*card1,col=2)
148
  lines(min:max,dnorm(min:max,mean2,sd2)*card2,col=3)
149
  lines(min:max,dnorm(min:max,mean(c(x1,x2)),sd(c(x1,x2)))*card2,col=4)
150
  llr_score_nvecs(list(x1,x2))
151
 })
152

    
153
dfadd = structure(function# Adding list to a dataframe.
154
### Add a list \emph{l} to a dataframe \emph{df}. Create it if \emph{df} is \emph{NULL}. Return the dataframe \emph{df}.
155
	(df, ##<<  A dataframe
156
		l ##<<  A list
157
	) {
158
  if (is.null(df)) {
159
    df = data.frame(l,stringsAsFactors=FALSE)
160
  } else {
161
    df = rbind(df, data.frame(l,stringsAsFactors=FALSE))
162
  }
163
  return(df)
164
### Return the dataframe \emph{df}.
165
}, ex=function(){
166
		## Here dataframe is NULL
167
		print(df)
168
		df = NULL
169

    
170
		# Initialize df
171
		df = dfadd(df, list(key1 = "value1", key2 = "value2"))
172
		print(df)
173

    
174
		# Adding elements to df
175
		df = dfadd(df, list(key1 = "value1'", key2 = "value2'"))
176
		print(df)
177
})
178

    
179

    
180
sign_from_strand = function(
181
### Get the sign of strand
182
strands) {
183
	apply(t(strands), 2, function(strand) {	if (strand == "F") return(1) else return(-1)})
184
### If strand in forward then returns 1 else returns -1
185
}
186

    
187
flat_reads = function(
188
### Extract reads coordinates from TempleteFilter input sequence
189
reads, ##<< TemplateFilter input reads
190
nuc_width ##<< Width used to shift F and R reads.
191
) {
192
	F_flatted_reads = unlist(apply(t(reads[reads$V3=="F",]),2,function(r){rep(as.integer(r[2]), r[4])}))
193
	R_flatted_reads = unlist(apply(t(reads[reads$V3=="R",]),2,function(r){rep(as.integer(r[2]), r[4])}))
194
	flatted_reads = c(F_flatted_reads + rep(nuc_width/2, length(F_flatted_reads)), R_flatted_reads - rep(nuc_width/2, length(R_flatted_reads))  )
195
	return(list(F_flatted_reads, R_flatted_reads, flatted_reads))
196
### Returns a list of F reads, R reads and joint/shifted F and R reads.
197
}
198

    
199

    
200

    
201
filter_tf_outputs = function(# Filter TemplateFilter outputs
202
### This function filters TemplateFilter outputs according, not only genome area observerved properties, but also correlation and overlapping threshold.
203
tf_outputs, ##<< TemplateFilter outputs.
204
chr, ##<< Chromosome observed, here chr is an integer.
205
x_min, ##<< Coordinate of the first bp observed.
206
x_max, ##<< Coordinate of the last bp observed.
207
nuc_width = 160, ##<< Nucleosome width.
208
ol_bp = 59, ##<< Overlap Threshold.
209
corr_thres = 0.5 ##<< Correlation threshold.
210
) {
211
  if (x_min < 0) {
212
    tf_outputs = tf_outputs[tf_outputs$chr == paste("chr", chr, sep="") & tf_outputs$center - tf_outputs$width/2 >= -x_max & tf_outputs$center + tf_outputs$width/2 <=  -x_min,]
213
	} else {
214
    tf_outputs = tf_outputs[tf_outputs$chr == paste("chr", chr, sep="") & tf_outputs$center - tf_outputs$width/2 >= x_min & tf_outputs$center + tf_outputs$width/2 <= x_max,]
215
  }
216
  tf_outputs$lower_bound = tf_outputs$center - tf_outputs$width/2
217
  tf_outputs$upper_bound = tf_outputs$center + tf_outputs$width/2
218
  tf_outputs = tf_outputs[tf_outputs$correlation.score >= corr_thres,]
219
  tf_outputs = tf_outputs[order(tf_outputs$correlation.score, decreasing=TRUE),]
220
  i = 1
221
  while (i <= length(tf_outputs[,1])) {
222
    lb = tf_outputs[i,]$lower_bound
223
    ub = tf_outputs[i,]$upper_bound
224
    tf_outputs = tf_outputs[!(tf_outputs$lower_bound <= (ub-ol_bp) & tf_outputs$upper_bound > ub) & !(tf_outputs$upper_bound >= (lb+ol_bp) & tf_outputs$lower_bound < lb),]
225
    i = i+1
226
  }
227
  return(tf_outputs)
228
### Returns filtered TemplateFilter Outputs
229
}
230

    
231

    
232

    
233
filter_tf_inputs = function(# Filter TemplateFilter inputs
234
### This function filters TemplateFilter inputs according genome area observed properties. It takes into account reads that are at the frontier of this area and the strand of these reads.
235
inputs, ##<< TF inputs to be filtered.
236
chr, ##<< Chromosome observed, here chr is an integer.
237
x_min, ##<< Coordinate of the first bp observed.
238
x_max, ##<< Coordinate of the last bp observed.
239
nuc_width = 160, ##<< Nucleosome width.
240
only_f = FALSE, ##<< Filter only F reads.
241
only_r = FALSE, ##<< Filter only R reads.
242
filter_for_coverage = FALSE ##<< Does it filter for plot coverage?
243
) {
244
	if (only_f) {
245
		inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "F" & inputs[,2] <= x_max + nuc_width,]
246
	} else if (only_r) {
247
		inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "R" & inputs[,2] <= x_max + nuc_width,]
248
	} else {
249
		inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,2] <= x_max + nuc_width,]
250
	}
251
  if (!filter_for_coverage) {
252
    corrected_inputs_coords = inputs[,2] + nuc_width/2 * sign_from_strand(inputs[,3])
253
    inputs = inputs[inputs[,1]==chr & corrected_inputs_coords >= x_min & corrected_inputs_coords <= x_max,]
254
  }
255
	return(inputs)
256
### Returns filtred inputs.
257
}
258

    
259

    
260

    
261
get_comp_strand = function(
262
### Compute the complementatry strand.
263
strand ##<< The original strand.
264
) {
265
	apply(t(strand),2, function(n){
266
	  if (n=="a") {return("t")}
267
		if (n=="t") {return("a")}
268
		if (n=="c") {return("g")}
269
		if (n=="g") {return("c")}
270
	})
271
### Returns the complementatry strand.
272
}
273

    
274

    
275
aggregate_intra_strain_nucs = structure(function(# Aggregate replicated sample's nucleosomes.
276
### This function aggregates nucleosome for replicated samples. It uses TemplateFilter ouput of each sample as replicate. Each sample owns a set of nucleosomes computed using TemplateFilter and ordered by the position of their center. Adajacent nucleosomes are compared two by two. Comparison is based on a log likelihood ratio score. The issue of comparison is adjacents nucleosomes merge or separation. Finally the function returns a list of clusters and all computed \emph{llr_scores}. Each cluster ows an attribute \emph{wp} for "well positionned". This attribute is set as \emph{TRUE} if the cluster is composed of exactly one nucleosomes of each sample.
277
samples, ##<< A list of samples. Each sample is a list like \emph{sample = list(id=..., marker=..., strain=..., roi=..., inputs=..., outputs=...)} with \emph{roi = list(name=..., begin=...,  end=..., chr=..., genome=...)}.
278
llr_thres=20, ##<< Log likelihood ration threshold.
279
coord_max=20000000 ##<< A too big value to be a coord for a nucleosome lower bound.
280
){
281
	end_of_tracks = function(tracks) {
282
		if (length(tracks) == 0) {
283
			return(TRUE)
284
		}
285
	  for (lower_bound in tracks) {
286
			if(!is.na(lower_bound)) {
287
	      if (lower_bound < coord_max) {
288
	        return(FALSE)
289
	      }
290
	  	}
291
	  }
292
	  return(TRUE)
293
	}
294
	store_cluster = function(clusters, new_cluster, nb_nucs_in_cluster,nuc_from_track,nb_tracks, min_nuc_center, max_nuc_center) {
295
		if ( nb_nucs_in_cluster==nb_tracks & sum(nuc_from_track)==nb_tracks) {
296
			new_cluster$wp = TRUE
297
			center = (new_cluster$lower_bound + new_cluster$upper_bound) / 2
298
			if (is.null(min_nuc_center) | ((min_nuc_center <= center) & (center < max_nuc_center))) {
299
		  	clusters[[length(clusters) + 1]] = new_cluster
300
				# print(new_cluster)
301
		  }
302
		} else {
303
			new_cluster$wp = FALSE
304
			center = (new_cluster$lower_bound + new_cluster$upper_bound) / 2
305
			if (is.null(min_nuc_center) | ((min_nuc_center <= center) & (center < max_nuc_center))) {
306
			  clusters[[length(clusters) + 1]] = new_cluster
307
			}
308
		}
309
		return(clusters)
310
	}
311
	strain = samples[[1]]$strain
312
	llr_scores = c()
313
  min_nuc_center = min(samples[[1]]$roi$begin, samples[[1]]$roi$end)
314
	max_nuc_center = max(samples[[1]]$roi$begin, samples[[1]]$roi$end)
315
  # compute clusters
316
  clusters = list()
317
  cluster_contents = list()
318
  # Init reader
319
  indexes = c()
320
  track_readers = c()
321
  current_nuc = NULL
322
	llr_score = llr_thres + 1
323
  # Read nucs from TF outputs
324
  tf_outs = list()
325
	i = 1
326
  for (sample in samples) {
327
		tf_outs[[i]] = sample$outputs
328
		tf_outs[[i]] = tf_outs[[i]][order(tf_outs[[i]]$center),]
329
    indexes[i] = 1
330
		if (is.na(tf_outs[[i]][indexes[i],]$center)) {
331
      track_readers[i] = coord_max
332
	  } else {
333
      track_readers[i] = tf_outs[[i]][indexes[i],]$center
334
		}
335
		i = i+1
336
  }
337
	# print(track_readers)
338
  new_cluster = NULL
339
  nb_nucs_in_cluster = 0
340
  nuc_from_track = c()
341
  for (i in 1:length(tf_outs)){
342
    nuc_from_track[i] = FALSE
343
  }
344
  # Start clustering
345
  while (!end_of_tracks(track_readers)) {
346
    new_center = min(track_readers)
347
		current_track = which(track_readers == new_center)[1]
348
    new_nuc = as.list(tf_outs[[current_track]][indexes[current_track],])
349
		new_nuc$chr = substr(new_nuc$chr,4,1000000L)
350
		new_nuc$inputs = samples[[current_track]]$inputs
351
		new_nuc$chr = samples[[current_track]]$roi$chr
352
		new_nuc$track = current_track
353

    
354
		new_nuc$inputs = filter_tf_inputs(samples[[current_track]]$inputs, new_nuc$chr, new_nuc$lower_bound, new_nuc$upper_bound, new_nuc$width)
355
		flatted_reads = flat_reads(new_nuc$inputs, new_nuc$width)
356
		new_nuc$original_reads = flatted_reads[[3]]
357

    
358
    new_upper_bound = new_nuc$upper_bound
359

    
360
    if (!is.null(current_nuc)) {
361
			llr_score = llr_score_nvecs(list(current_nuc$original_reads,new_nuc$original_reads))
362
			llr_scores = c(llr_scores,llr_score)
363
		}
364
		# print(paste(llr_score, length(current_nuc$original_reads), length(new_nuc$original_reads), sep=" "))
365
		if (is.na(llr_score)) {
366
			llr_score = llr_thres + 1
367
		}
368
		# Store llr_score
369
		new_nuc$llr_score = llr_score
370
	  if (llr_score < llr_thres) {
371
      # aggregate to current cluster
372
      #   update bound
373
      if (new_nuc$upper_bound > new_cluster$upper_bound) {
374
        new_cluster$upper_bound = new_nuc$upper_bound
375
      }
376
      if (new_nuc$lower_bound < new_cluster$lower_bound) {
377
        new_cluster$lower_bound = new_nuc$lower_bound
378
      }
379
      #   add nucleosome to current cluster
380
      nuc_from_track[current_track] = TRUE
381
      nb_nucs_in_cluster = nb_nucs_in_cluster + 1
382
			new_cluster$nucs[[length(new_cluster$nucs)+1]] = new_nuc
383
    } else {
384
			if (!is.null(new_cluster)) {
385
        # store old cluster
386
	      clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster,nuc_from_track,length(tf_outs),min_nuc_center, max_nuc_center)
387
			}
388
      # Reinit current cluster composition stuff
389
      nb_nucs_in_cluster = 0
390
      nuc_from_track = c()
391
      for (i in 1:length(tf_outs)){
392
        nuc_from_track[i] = FALSE
393
      }
394
      # create new cluster
395
      new_cluster = list(lower_bound=new_nuc$low, upper_bound=new_nuc$up, chr=new_nuc$chr, strain_ref=strain , nucs=list())
396
      # update upper bound
397
      current_upper_bound = new_upper_bound
398
      # add nucleosome to current cluster
399
      nb_nucs_in_cluster = nb_nucs_in_cluster + 1
400
      nuc_from_track[current_track] = TRUE
401
			new_cluster$nucs[[length(new_cluster$nucs)+1]] = new_nuc
402

    
403
		}
404

    
405
		current_nuc = new_nuc
406

    
407
    # update indexes
408
    if (indexes[current_track] < length(tf_outs[[current_track]]$center)) {
409
      indexes[current_track] = indexes[current_track] + 1
410
      # update track
411
      track_readers[current_track] = tf_outs[[current_track]][indexes[current_track],]$center
412
    } else {
413
      # update track
414
      track_readers[current_track] = coord_max
415
    }
416
  }
417
  # store last cluster
418
  if (!is.null(new_cluster)) {
419
    # store old cluster
420
    clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster,nuc_from_track,length(tf_outs),min_nuc_center, max_nuc_center)
421
  }
422
	return(list(clusters, llr_scores))
423
### Returns a list of clusterized nucleosomes, and all computed llr scores.
424
}, ex=function(){
425
	# Dealing with a region of interest
426
	roi =list(name="example", begin=1000,  end=1300, chr="1", genome=rep("A",301))
427
	samples = list()
428
	for (i in 1:3) {
429
		# Create TF output
430
		tf_nuc = list("chr"=paste("chr", roi$chr, sep=""), "center"=(roi$end + roi$begin)/2, "width"= 150, "correlation.score"= 0.9)
431
		outputs = dfadd(NULL,tf_nuc)
432
		outputs = filter_tf_outputs(outputs, roi$chr, roi$begin, roi$end)
433
		# Generate corresponding reads
434
		nb_reads = round(runif(1,170,230))
435
		reads = round(rnorm(nb_reads, tf_nuc$center,20))
436
		u_reads = sort(unique(reads))
437
		strands = sample(c(rep("R",ceiling(length(u_reads)/2)),rep("F",floor(length(u_reads)/2))))
438
		counts = apply(t(u_reads), 2, function(r) { sum(reads == r)})
439
		shifts = apply(t(strands), 2, function(s) { if (s == "F") return(-tf_nuc$width/2) else return(tf_nuc$width/2)})
440
		u_reads = u_reads + shifts
441
		inputs = data.frame(list("V1" = rep(roi$chr, length(u_reads)),
442
		                         "V2" = u_reads,
443
														 "V3" = strands,
444
														 "V4" = counts), stringsAsFactors=FALSE)
445
		samples[[length(samples) + 1]] = list(id=1, marker="Mnase_Seq", strain="strain_ex", total_reads = 10000000, roi=roi, inputs=inputs, outputs=outputs)
446
	}
447
	print(aggregate_intra_strain_nucs(samples))
448
})
449

    
450
flat_aggregated_intra_strain_nucs = function(# to flat aggregate_intra_strain_nucs function output
451
### This function builds a dataframe of all clusters obtain from aggregate_intra_strain_nucs function.
452
partial_strain_maps, ##<< the output of aggregate_intra_strain_nucs function
453
cur_index ##<< the index of the roi involved
454
) {
455
	if  (length(partial_strain_maps) == 0 ){
456
		print(paste("Empty partial_strain_maps for roi", cur_index, "ands current strain." ))
457
    tmp_strain_maps = list()
458
	} else {
459
		tmp_strain_map = apply(t(1:length(partial_strain_maps)), 2, function(i){
460
			tmp_nuc = partial_strain_maps[[i]]
461
			tmp_nuc_as_list = list()
462
			tmp_nuc_as_list[["chr"]] = tmp_nuc[["chr"]]
463
			tmp_nuc_as_list[["lower_bound"]] = ceiling(tmp_nuc[["lower_bound"]])
464
			tmp_nuc_as_list[["upper_bound"]] = floor(tmp_nuc[["upper_bound"]])
465
			tmp_nuc_as_list[["cur_index"]] = cur_index
466
			tmp_nuc_as_list[["index_nuc"]] = i
467
			tmp_nuc_as_list[["wp"]] = as.integer(tmp_nuc$wp)
468
			all_original_reads = c()
469
			for (j in 1:length(tmp_nuc$nucs)) {
470
				all_original_reads = c(all_original_reads, tmp_nuc$nucs[[j]]$original_reads)
471
			}
472
			tmp_nuc_as_list[["nb_reads"]] = length(all_original_reads)
473
			tmp_nuc_as_list[["nb_nucs"]] = length(tmp_nuc$nucs)
474
			if (tmp_nuc$wp) {
475
				tmp_nuc_as_list[["llr_1"]] = signif(tmp_nuc$nucs[[2]]$llr_score,5)
476
				tmp_nuc_as_list[["llr_2"]] = signif(tmp_nuc$nucs[[3]]$llr_score,5)
477
			} else {
478
				tmp_nuc_as_list[["llr_1"]] = NA
479
				tmp_nuc_as_list[["llr_2"]] = NA
480
			}
481
      return(tmp_nuc_as_list)
482
    })
483
    tmp_strain_maps = do.call("rbind", tmp_strain_map)
484
	}
485
  return(data.frame(lapply(data.frame(tmp_strain_maps, stringsAsFactors=FALSE), unlist), stringsAsFactors=FALSE))
486
### Returns a dataframe of all clusters obtain from aggregate_intra_strain_nucs function.
487
}
488

    
489
align_inter_strain_nucs = structure(function(# Aligns nucleosomes between 2 strains.
490
### This function aligns nucs between two strains for a given genome region.
491
replicates, ##<< Set of replicates, ideally 3 per strain.
492
wp_nucs_strain_ref1=NULL, ##<< List of aggregates nucleosome for strain 1. If it's null this list will be computed.
493
wp_nucs_strain_ref2=NULL, ##<< List of aggregates nucleosome for strain 2. If it's null this list will be computed.
494
corr_thres=0.5, ##<< Correlation threshold.
495
llr_thres=100, ##<< LOD cut off.
496
config=NULL, ##<< GLOBAL config variable
497
... ##<< A list of parameters that will be passed to \emph{aggregate_intra_strain_nucs} if needed.
498
) {
499
	if (length(replicates) < 2) {
500
		stop("ERROR, align_inter_strain_nucs needs 2 replicate sets.")
501
	} else if (length(replicates) > 2) {
502
		print("WARNING, align_inter_strain_nucs will use 2 first sets of replicates as inputs.")
503
	}
504
	common_nuc = NULL
505
	llr_scores = c()
506
	chr = replicates[[1]][[1]]$roi$chr
507
  min_nuc_center = min(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end)
508
	max_nuc_center = max(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end)
509
	strain_ref1 = replicates[[1]][[1]]$strain
510
	strain_ref2 = replicates[[2]][[1]]$strain
511
	big_cur = replicates[[1]][[1]]$roi
512
  orig_big_cur = replicates[[1]][[1]]$orig_roi
513
	if(big_cur$end - big_cur$begin < 0) {
514
		tmp_begin = big_cur$begin
515
		big_cur$begin =  big_cur$end
516
		big_cur$end =  tmp_begin
517
	}
518
	# GO!
519
	if (is.null(wp_nucs_strain_ref1)) {
520
		wp_nucs_strain_ref1 = aggregate_intra_strain_nucs(replicates[[1]], ...)[[1]]
521
	}
522
	if (is.null(wp_nucs_strain_ref2)) {
523
	  wp_nucs_strain_ref2 = aggregate_intra_strain_nucs(replicates[[2]], ...)[[1]]
524
  }
525
	lws = c()
526
	ups = c()
527
	for (na in wp_nucs_strain_ref2) {
528
		lws = c(lws, na$lower_bound)
529
		ups = c(ups, na$upper_bound)
530
	}
531

    
532
	print(paste("Exploring chr" , chr , ", " , length(wp_nucs_strain_ref1) , ", [" , min_nuc_center , ", " , max_nuc_center , "] nucs...", sep=""))
533
	roi_strain_ref1 = NULL
534
	roi_strain_ref2 = NULL
535
	if (length(wp_nucs_strain_ref1) > 0) {
536
		for(index_nuc_strain_ref1 in 1:length(wp_nucs_strain_ref1)){
537
			# print(paste("" , index_nuc_strain_ref1 , "/" , length(wp_nucs_strain_ref1), sep=""))
538
			nuc_strain_ref1 = wp_nucs_strain_ref1[[index_nuc_strain_ref1]]
539
			# Filtering on Well Positionned
540
			if (nuc_strain_ref1$wp) {
541
				roi_strain_ref1 = list(name=paste("strain_chr_id_" , strain_ref1 , "_" , chr , "_" , "i" , "_", sep=""), begin=nuc_strain_ref1$lower_bound, end=nuc_strain_ref1$upper_bound, chr=chr, strain_ref = strain_ref1)
542
				roi_strain_ref2 = translate_cur(roi_strain_ref1, strain_ref2, big_cur=orig_big_cur, config=config)
543
        if (!is.null(roi_strain_ref2)){
544
					# LOADING INTRA_STRAIN_NUCS_FILENAME_STRAIN_REF2 FILE(S) TO COMPUTE MATCHING_NAS (FILTER)
545
					lower_bound_roi_strain_ref2 = min(roi_strain_ref2$end,roi_strain_ref2$begin)
546
					upper_bound_roi_strain_ref2 = max(roi_strain_ref2$end,roi_strain_ref2$begin)
547
					matching_nas = which( lower_bound_roi_strain_ref2 <= ups & lws <= upper_bound_roi_strain_ref2)
548
					for (index_nuc_strain_ref2 in matching_nas) {
549
						nuc_strain_ref2 = wp_nucs_strain_ref2[[index_nuc_strain_ref2]]
550
						# Filtering on Well Positionned
551
    				nuc_strain_ref2_to_roi = list(begin=nuc_strain_ref2$lower_bound, end=nuc_strain_ref2$upper_bound, chr=nuc_strain_ref2$chr, strain_ref = strain_ref2)
552
						if (!is.null(translate_cur(nuc_strain_ref2_to_roi, strain_ref1, big_cur=orig_big_cur, config=config)) &
553
                nuc_strain_ref2$wp) {
554
							# Filtering on correlation Score and collecting reads
555
							SKIP = FALSE
556
							# TODO: This for loop could be done before working on strain_ref2. Isn't it?
557
							reads_strain_ref1 = c()
558
							for (nuc in nuc_strain_ref1$nucs){
559
								reads_strain_ref1 = c(reads_strain_ref1, nuc$original_reads)
560
								if (nuc$corr < corr_thres) {
561
									SKIP = TRUE
562
								}
563
							}
564
							reads_strain_ref2 = c()
565
							for (nuc in nuc_strain_ref2$nucs){
566
								reads_strain_ref2 = c(reads_strain_ref2, nuc$original_reads)
567
								if (nuc$corr < corr_thres) {
568
									SKIP = TRUE
569
								}
570
							}
571
							# Filtering on correlation Score
572
							if (!SKIP) {
573
								# tranlation of reads into strain 2 coords
574
								diff = ((roi_strain_ref1$begin + roi_strain_ref1$end) - (roi_strain_ref2$begin + roi_strain_ref2$end)) / 2
575
								reads_strain_ref1 = reads_strain_ref1 - rep(diff, length(reads_strain_ref1))
576
								llr_score = llr_score_nvecs(list(reads_strain_ref1, reads_strain_ref2))
577
								llr_scores = c(llr_scores, llr_score)
578
								# Filtering on LOD Score
579
                # if (llr_score < llr_thres) {
580
								if (TRUE) {
581
									tmp_nuc = list()
582
									# strain_ref1
583
									tmp_nuc[[paste("chr_", strain_ref1, sep="")]] = chr
584
									tmp_nuc[[paste("lower_bound_", strain_ref1, sep="")]] = nuc_strain_ref1$lower_bound
585
									tmp_nuc[[paste("upper_bound_", strain_ref1, sep="")]] = nuc_strain_ref1$upper_bound
586
									tmp_nuc[[paste("mean_", strain_ref1, sep="")]] = signif(mean(reads_strain_ref1),5)
587
									tmp_nuc[[paste("sd_", strain_ref1, sep="")]] = signif(sd(reads_strain_ref1),5)
588
									tmp_nuc[[paste("nb_reads_", strain_ref1, sep="")]] = length(reads_strain_ref1)
589
									tmp_nuc[[paste("index_nuc_", strain_ref1, sep="")]] = index_nuc_strain_ref1
590
									# strain_ref2
591
									tmp_nuc[[paste("chr_", strain_ref2, sep="")]] = roi_strain_ref2$chr
592
									tmp_nuc[[paste("lower_bound_", strain_ref2, sep="")]] = nuc_strain_ref2$lower_bound
593
									tmp_nuc[[paste("upper_bound_", strain_ref2, sep="")]] = nuc_strain_ref2$upper_bound
594
									tmp_nuc[[paste("means_", strain_ref2, sep="")]] = signif(mean(reads_strain_ref2),5)
595
									tmp_nuc[[paste("sd_", strain_ref2, sep="")]] = signif(sd(reads_strain_ref2),5)
596
									tmp_nuc[[paste("nb_reads_", strain_ref2, sep="")]] = length(reads_strain_ref2)
597
									tmp_nuc[[paste("index_nuc_", strain_ref2, sep="")]] = index_nuc_strain_ref2
598
									# common
599
									tmp_nuc[["llr_score"]] = signif(llr_score,5)
600
									common_nuc = dfadd(common_nuc, tmp_nuc)
601
                }
602
							}
603
						}
604
					}
605
				} else {
606
		      print("WARNING! No roi for strain ref 2.")
607
			  }
608
		  }
609
		}
610

    
611
		if(length(unique(common_nuc[,1:3])[,1]) != length((common_nuc[,1:3])[,1])) {
612
			index_redundant = which(apply(common_nuc[,1:3][-length(common_nuc[,1]),] ==  common_nuc[,1:3][-1,] ,1,sum) == 3)
613
			to_remove_list = c()
614
			for (i in 1:length(index_redundant)) {
615
				if (common_nuc[index_redundant[i],15] < common_nuc[index_redundant[i]+1,15]) {
616
				  to_remove = index_redundant[i]
617
				}	 else {
618
					to_remove = index_redundant[i] + 1
619
			  }
620
				to_remove_list = c(to_remove_list, to_remove)
621
			}
622
			common_nuc = common_nuc[-to_remove_list,]
623
		}
624

    
625
		if(length(unique(common_nuc[,8:10])[,1]) != length((common_nuc[,8:10])[,1])) {
626
			index_redundant = which(apply(common_nuc[,8:10][-length(common_nuc[,1]),] == common_nuc[,8:10][-1,] ,1,sum) == 3)
627
			to_remove_list = c()
628
			for (i in 1:length(index_redundant)) {
629
				if (common_nuc[index_redundant[i],15] < common_nuc[index_redundant[i]+1,15]) {
630
				  to_remove = index_redundant[i]
631
				}	 else {
632
					to_remove = index_redundant[i] + 1
633
			  }
634
				to_remove_list = c(to_remove_list, to_remove)
635
			}
636
			common_nuc = common_nuc[-to_remove_list,]
637
		}
638

    
639
		return(list(common_nuc, llr_scores))
640
	} else {
641
		print("WARNING, no nucs for strain_ref1.")
642
		return(NULL)
643
	}
644
### Returns a list of clusterized nucleosomes, and all computed llr scores.
645
}, ex=function(){
646

    
647
    # Define new translate_cur function...
648
    translate_cur = function(roi, strain2, big_cur=NULL, config=NULL) {
649
      return(roi)
650
    }
651
    # Binding it by uncomment follwing lines.
652
    unlockBinding("translate_cur", as.environment("package:nucleominer"))
653
    unlockBinding("translate_cur", getNamespace("nucleominer"))
654
    assign("translate_cur", translate_cur, "package:nucleominer")
655
    assign("translate_cur", translate_cur, getNamespace("nucleominer"))
656
    lockBinding("translate_cur", getNamespace("nucleominer"))
657
    lockBinding("translate_cur", as.environment("package:nucleominer"))
658

    
659
	# Dealing with a region of interest
660
	roi =list(name="example", begin=1000,  end=1300, chr="1", genome=rep("A",301), strain_ref1 = "STRAINREF1")
661
	roi2 = translate_cur(roi, roi$strain_ref1)
662
	replicates = list()
663
	for (j in 1:2) {
664
		samples = list()
665
		for (i in 1:3) {
666
			# Create TF output
667
			tf_nuc = list("chr"=paste("chr", roi$chr, sep=""), "center"=(roi$end + roi$begin)/2, "width"= 150, "correlation.score"= 0.9)
668
			outputs = dfadd(NULL,tf_nuc)
669
			outputs = filter_tf_outputs(outputs, roi$chr, roi$begin, roi$end)
670
			# Generate corresponding reads
671
			nb_reads = round(runif(1,170,230))
672
			reads = round(rnorm(nb_reads, tf_nuc$center,20))
673
			u_reads = sort(unique(reads))
674
			strands = sample(c(rep("R",ceiling(length(u_reads)/2)),rep("F",floor(length(u_reads)/2))))
675
			counts = apply(t(u_reads), 2, function(r) { sum(reads == r)})
676
			shifts = apply(t(strands), 2, function(s) { if (s == "F") return(-tf_nuc$width/2) else return(tf_nuc$width/2)})
677
			u_reads = u_reads + shifts
678
			inputs = data.frame(list("V1" = rep(roi$chr, length(u_reads)),
679
			                         "V2" = u_reads,
680
															 "V3" = strands,
681
															 "V4" = counts), stringsAsFactors=FALSE)
682
			samples[[length(samples) + 1]] = list(id=1, marker="Mnase_Seq", strain=paste("strain_ex",j,sep=""), total_reads = 10000000, roi=roi, inputs=inputs, outputs=outputs)
683
		}
684
		replicates[[length(replicates) + 1]] = samples
685
	}
686
	print(align_inter_strain_nucs(replicates))
687
})
688

    
689

    
690

    
691

    
692

    
693

    
694

    
695

    
696

    
697

    
698

    
699

    
700

    
701

    
702

    
703

    
704

    
705

    
706

    
707

    
708

    
709

    
710

    
711

    
712

    
713

    
714
fetch_mnase_replicates = function(# Prefetch data
715
### Fetch and filter inputs and outpouts per region of interest. Organize it per replicates.
716
strain, ##<< The strain we want mnase replicatesList of replicates. Each replicates is a vector of sample ids.
717
roi, ##<< Region of interest.
718
all_samples, ##<< Global list of samples.
719
config=NULL, ##<< GLOBAL config variable
720
only_fetch=FALSE, ##<< If TRUE, only fetch and not filtering. It is used tio load sample files into memory before forking.
721
get_genome=FALSE, ##<< If TRUE, load corresponding genome sequence.
722
get_ouputs=TRUE##<< If TRUE, get also ouput corresponding TF output files.
723
) {
724
	samples=list()
725
  samples_ids = unique(all_samples[all_samples$marker == "Mnase_Seq" & all_samples$strain == strain,]$id)
726
	for (i in samples_ids) {
727
		sample = as.list(all_samples[all_samples$id==i,])
728
    sample$orig_roi = roi
729
    sample$roi = translate_cur(roi, sample$strain, config = config)
730
		if (get_genome) {
731
			# Get Genome
732
      sample$roi$genome = get_content(config$FASTA_REFERENCE_GENOME_FILES[[sample$strain]], "fasta")[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end]
733
		}
734
		# Get inputs
735
		sample$inputs = get_content(paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep=""), "table", stringsAsFactors=FALSE)
736
		sample$total_reads = sum(sample$inputs[,4])
737
		if (!only_fetch) {
738
		  sample$inputs = filter_tf_inputs(sample$inputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
739
	  }
740
	  # Get TF outputs for Mnase_Seq samples
741
		if (sample$marker == "Mnase_Seq" & get_ouputs) {
742
			sample$outputs = get_content(paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep=""), "table", header=TRUE, sep="\t")
743
			if (!only_fetch) {
744
	  		sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr,  min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
745
  		}
746
		}
747
		samples[[length(samples) + 1]] = sample
748
	}
749
  return(samples)
750
}
751

    
752
substract_region = function(# Substract to a list of regions an other list of regions that intersect it.
753
### This fucntion embed a recursive part. It occurs when a substracted region split an original region on two.
754
region1, ##<< Original regions.
755
region2 ##<< Regions to substract.
756
) {
757
  rec_substract_region = function(region1, region2) {
758
  non_inter_fuzzy = apply(t(1:length(region1[,1])), 2, function(i) {
759
    cur_fuzzy = region1[i,]
760
    inter_wp = region2[region2$lower_bound <= cur_fuzzy$upper_bound & region2$upper_bound >= cur_fuzzy$lower_bound,]
761
    if (length(inter_wp[,1]) > 0) {
762
      ret = c()
763
      for (j in 1:length(inter_wp[,1])) {
764
        cur_wp = inter_wp[j,]
765
        if (cur_wp$lower_bound <= cur_fuzzy$lower_bound & cur_fuzzy$upper_bound <= cur_wp$upper_bound) {
766
          # remove cur_fuzzy
767
          ret = c()
768
          break
769
        } else if (cur_wp$lower_bound <= cur_fuzzy$lower_bound & cur_wp$upper_bound < cur_fuzzy$upper_bound) {
770
          # crop fuzzy
771
          cur_fuzzy$lower_bound = cur_wp$upper_bound + 1
772
          ret = cur_fuzzy
773
        } else if (cur_fuzzy$lower_bound < cur_wp$lower_bound & cur_fuzzy$upper_bound <= cur_wp$upper_bound) {
774
          # crop fuzzy
775
          cur_fuzzy$upper_bound = cur_wp$lower_bound - 1
776
          ret = cur_fuzzy
777
        } else if (cur_wp$lower_bound > cur_fuzzy$lower_bound & cur_wp$upper_bound < cur_fuzzy$upper_bound) {
778
          # split fuzzy
779
          tmp_ret_fuzzy_1 = cur_fuzzy
780
          tmp_ret_fuzzy_1$upper_bound = cur_wp$lower_bound - 1
781
          tmp_ret_fuzzy_2 = cur_fuzzy
782
          tmp_ret_fuzzy_2$lower_bound = cur_wp$upper_bound + 1
783
          ret = rec_substract_region(rbind(tmp_ret_fuzzy_1, tmp_ret_fuzzy_2), inter_wp)
784
          # print(ret)
785
          # ret = cur_fuzzy
786
          break
787
        } else {
788
          stop("WARNING NO ADAPTED CASE!")
789
        }
790
      }
791
      return(ret)
792
    } else {
793
      return(cur_fuzzy)
794
    }
795
  })
796
  }
797
  non_inter_fuzzy = rec_substract_region(region1[,1:4], region2[,1:4])
798
  if (is.null(non_inter_fuzzy)) {return(non_inter_fuzzy)}
799
  tmp_ulist = unlist(non_inter_fuzzy)
800
  tmp_names = names(tmp_ulist)[1:4]
801
  non_inter_fuzzy = data.frame(matrix(tmp_ulist, ncol=4, byrow=TRUE), stringsAsFactors=FALSE)
802
  names(non_inter_fuzzy) = tmp_names
803
  non_inter_fuzzy$chr = as.character(non_inter_fuzzy$chr)
804
  non_inter_fuzzy$chr = as.numeric(non_inter_fuzzy$chr)
805
  non_inter_fuzzy$lower_bound = as.numeric(non_inter_fuzzy$lower_bound)
806
  non_inter_fuzzy$upper_bound = as.numeric(non_inter_fuzzy$upper_bound)
807
  non_inter_fuzzy = non_inter_fuzzy[order(non_inter_fuzzy$lower_bound),]
808
  return(non_inter_fuzzy)
809
}
810

    
811
union_regions = function(# Aggregate regions that intersect themnselves.
812
### This function is based on sort of lower bounds to detect regions that intersect. We compare lower bound and upper bound of the porevious item. This function embed a while loop and break break regions list become stable.
813
regions ##<< The Regions to be aggregated
814
) {
815
  if (is.null(regions)) {return(regions)}
816
  if (nrow(regions) == 0) {return(regions)}
817
  old_length = length(regions[,1])
818
  new_length = 0
819

    
820
  while (old_length != new_length) {
821
    regions = regions[order(regions$lower_bound), ]
822
    regions$stop = !c(regions$lower_bound[-1] - regions$upper_bound[-length(regions$lower_bound)] <= 1, TRUE)
823
    vec_end_1 = which(regions$stop)
824
    if (length(vec_end_1) == 0) {
825
      vec_end_1 = c(length(regions$stop))
826
    }
827
    if (vec_end_1[length(vec_end_1)] != length(regions$stop)) {
828
      vec_end_1 = c(vec_end_1, length(regions$stop))
829
    }
830
    vec_beg_1 = c(1, vec_end_1[-length(vec_end_1)] + 1)
831
    union = apply(t(1:length(vec_beg_1)), 2, function(i) {
832
      chr = regions$chr[vec_beg_1[i]]
833
      lower_bound = min(regions$lower_bound[vec_beg_1[i]:vec_end_1[i]])
834
      upper_bound = max(regions$upper_bound[vec_beg_1[i]:vec_end_1[i]])
835
      cur_index = regions$cur_index[vec_beg_1[i]]
836
      data.frame(list(chr=chr, lower_bound=lower_bound, upper_bound=upper_bound, cur_index=cur_index))
837
      })
838
    union = collapse_regions(union)
839
    old_length = length(regions[,1])
840
    new_length = length(union[,1])
841
    regions = union
842
  }
843
  return(union)
844
}
845

    
846
# remove_aligned_wp = function(# Remove wp nucs from common nucs list.
847
# ### It is based on common wp nucs index on nucs and region.
848
# strain_maps, ##<< Nuc maps.
849
# cur_index, ##<< The region of interest index.
850
# tmp_common_nucs, ##<< the list of wp nucs.
851
# strain##<< The strain to consider.
852
# ){
853
#   fuzzy_nucs = strain_maps[[strain]]
854
#   fuzzy_nucs = fuzzy_nucs[fuzzy_nucs$cur_index == cur_index,]
855
#   fuzzy_nucs = fuzzy_nucs[order(fuzzy_nucs$index_nuc),]
856
#   if (length(fuzzy_nucs[,1]) == 0) {return(fuzzy_nucs)}
857
#   if (sum(fuzzy_nucs$index_nuc == min(fuzzy_nucs$index_nuc):max(fuzzy_nucs$index_nuc)) != max(fuzzy_nucs$index_nuc)) {"Warning in index!"}
858
#   anti_index_1 = tmp_common_nucs[[paste("index_nuc", strain, sep="_")]]
859
#   fuzzy_nucs = fuzzy_nucs[-anti_index_1,]
860
#   return(fuzzy_nucs)
861
# }
862

    
863
translate_regions = function(# Translate a list of regions from a strain ref to another.
864
### This function is an eloborated call to translate_cur.
865
regions, ##<< Regions to be translated.
866
combi, ##<< Combination of strains.
867
cur_index, ##<< The region of interest index.
868
config=NULL, ##<< GLOBAL config variable
869
roi ##<< The region of interest.
870
) {
871
  tr_regions = apply(t(1:length(regions[,1])), 2, function(i) {
872
    tmp_regions_ref2 = list(name="foo", begin=regions[i,]$lower_bound, end=regions[i,]$upper_bound, chr=as.character(regions[i,]$chr), strain_ref = combi[2])
873
    big_cur =  roi
874
    trs_tmp_regions_ref2 = translate_cur(tmp_regions_ref2, combi[1], config = config, big_cur = big_cur)
875
    if (is.null(trs_tmp_regions_ref2)) {
876
      return(NULL)
877
    } else {
878
      return(data.frame(list(chr=trs_tmp_regions_ref2$chr, lower_bound=min(trs_tmp_regions_ref2$begin, trs_tmp_regions_ref2$end), upper_bound=max(trs_tmp_regions_ref2$begin, trs_tmp_regions_ref2$end), cur_index=cur_index)))
879
    }
880
  })
881

    
882
  return(collapse_regions(tr_regions))
883
}
884

    
885
collapse_regions = function(# reformat an "apply  manipulated" list of regions
886
### Utils to reformat an "apply  manipulated" list of regions
887
regions ##< a list of regions
888
) {
889
  if (is.null(regions)) {
890
    return(NULL)
891
  } else {
892
    regions = do.call(rbind, regions)
893
    regions$chr = as.character(regions$chr)
894
    regions$chr = as.numeric(regions$chr)
895
    regions$lower_bound = as.numeric(regions$lower_bound)
896
    regions$upper_bound = as.numeric(regions$upper_bound)
897
    regions = regions[order(regions$lower_bound),]
898
    return(regions)
899
  }
900
}
901

    
902

    
903
crop_fuzzy = function(# Crop bound of regions according to region of interest bound
904
### The fucntion is no more necessary since we remove "big_cur" bug in translate_cur function.
905
tmp_fuzzy_nucs, ##<< the regiuons to be croped.
906
roi, ##<< The region of interest.
907
strain, ##<< The strain to consider.
908
config=NULL ##<< GLOBAL config variable
909
) {
910
  tr_roi = translate_cur(roi, strain, config = config)
911
  tr_roi_begin = min(tr_roi$begin, tr_roi$end)
912
  tr_roi_end = max(tr_roi$begin, tr_roi$end)
913
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound < tr_roi_begin,1]) > 0) {
914
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound < tr_roi_begin,]$lower_bound = tr_roi_begin
915
  }
916
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound < tr_roi_begin,1]) > 0) {
917
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound < tr_roi_begin,]$upper_bound = tr_roi_begin
918
  }
919
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound > tr_roi_end,1]) > 0) {
920
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound > tr_roi_end,]$lower_bound = tr_roi_end
921
  }
922
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound > tr_roi_end,1]) > 0) {
923
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound > tr_roi_end,]$upper_bound = tr_roi_end
924
  }
925
  tmp_fuzzy_nucs = tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound != tmp_fuzzy_nucs$lower_bound,]
926
  return(tmp_fuzzy_nucs)
927
}
928

    
929
get_all_reads = function(# Retrieve Reads
930
### Retrieve reads for a given marker, combi, form.
931
marker, ##<< The marker to considere.
932
combi, ##<< The starin combination to considere.
933
form="wp", ##<< The nuc form to considere.
934
config=NULL ##<< GLOBAL config variable
935
) {
936
	all_reads = NULL
937
  for (manip in c("Mnase_Seq", marker)) {
938
    if (form == "unr") {
939
		  out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_unr_and_nbreads.tab",sep="")
940
  		tmp_res = read.table(file=out_filename, header=TRUE)
941
			tmp_res = tmp_res[tmp_res[,3] - tmp_res[,2] > 75,]
942
      tmp_res$form = form
943
    } else if (form == "wp") {
944
		 	out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_wp_and_nbreads.tab",sep="")
945
  		tmp_res = read.table(file=out_filename, header=TRUE)
946
      tmp_res$form = form
947
    } else if (form == "wpunr") {
948
		 	out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_wp_and_nbreads.tab",sep="")
949
  		tmp_res = read.table(file=out_filename, header=TRUE)
950
      tmp_res$form = "wp"
951
		  out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_unr_and_nbreads.tab",sep="")
952
  		tmp_res2 = read.table(file=out_filename, header=TRUE)
953
			tmp_res2 = tmp_res2[tmp_res2[,3] - tmp_res2[,2] > 75,]
954
      tmp_res2$form = "unr"
955
      tmp_res = rbind(tmp_res, tmp_res2)
956
    }
957
		if (is.null(all_reads)) {
958
			all_reads = tmp_res[,c(1:9,length(tmp_res))]
959
		}
960
		tmp_res = tmp_res[,-c(1:9,length(tmp_res))]
961
		all_reads = cbind(all_reads, tmp_res)
962
  }
963
  return(all_reads)
964
}
965

    
966
get_design = function(# Build the design for deseq
967
### This function build the design according sample properties.
968
marker, ##<< The marker to considere.
969
combi, ##<< The starin combination to considere.
970
all_samples ##<< Global list of samples.
971
) {
972
  off1 = 0
973
  off2 = 0
974
	manips = c("Mnase_Seq", marker)
975
	design_rownames = c()
976
	design_manip = c()
977
	design_strain = c()
978
  off2index = function(off) {
979
  	switch(toString(off),
980
  		"1"=c(0,1,1),
981
  	  "2"=c(1,0,1),
982
    	"3"=c(1,1,0),
983
  		c(1,1,1)
984
  		)
985
  }
986
	for (manip in manips) {
987
		tmp_samples = all_samples[ ((all_samples$strain == combi[1] | all_samples$strain == combi[2]) &  all_samples$marker == manip), ]
988
		tmp_samples = tmp_samples[order(tmp_samples$strain), ]
989
		if (manip == "H3K4me1" & (off1 != 0 & off2 ==0 )) {
990
			tmp_samples = tmp_samples[c(off2index(off1), c(1,1)) == 1,]
991
		} else {
992
			if (manip != "Mnase_Seq" & (off1 != 0 | off2 !=0)) {
993
				tmp_samples = tmp_samples[c(off2index(off1), off2index(off2)) == 1,]
994
			}
995
		}
996
		design_manip = c(design_manip, rep(manip, length(tmp_samples$id)))
997
		for (strain in combi) {
998
			cols = apply(t(tmp_samples[ (tmp_samples$strain == strain &  tmp_samples$marker == manip), ]$id), 2, function(i){paste(strain, manip, i, sep="_")})
999
			design_strain = c(design_strain, rep(strain, length(cols)))
1000
			design_rownames = c(design_rownames, cols)
1001
		}
1002
	}
1003
	snep_design = data.frame( row.names=design_rownames, manip=design_manip, strain=design_strain)
1004
	return(snep_design)
1005
}
1006

    
1007
plot_dist_samples = function(# Plot the distribution of reads.
1008
### This fuxntion use the deseq nomalization feature to compare qualitatively the distribution.
1009
strain, ##<< The strain to considere.
1010
marker, ##<< The marker to considere.
1011
res, ##<< Data
1012
all_samples, ##<< Global list of samples.
1013
NEWPLOT = TRUE ##<< If FALSE the curve will be add to the current plot.
1014
) {
1015
	cols = apply(t(all_samples[ (all_samples$strain == strain &  all_samples$marker == marker), ]$id), 2, function(i){paste(strain, marker, i, sep="_")})
1016
	snepCountTable = res[,cols]
1017
	snepDesign = data.frame(
1018
		row.names = cols,
1019
		manip = rep(marker, length(cols)),
1020
		strain = rep(strain, length(cols))
1021
		)
1022
	cdsFull = newCountDataSet(snepCountTable, snepDesign)
1023
	sizeFactors = estimateSizeFactors(cdsFull)
1024
	# print(sizeFactors[[1]])
1025
	sample_ids = all_samples[ (all_samples$strain == strain &  all_samples$marker == marker), ]$id
1026
	if (NEWPLOT) {
1027
		plot(density(res[,paste(strain, marker, sample_ids[1], sep="_")] / sizeFactors[[1]][1]), col=0, main=paste(strain, marker))
1028
		NEWPLOT = FALSE
1029
	}
1030
	for (it in 1:length(sample_ids)) {
1031
		sample_id = sample_ids[it]
1032
		lines(density(res[,paste(strain, marker, sample_id, sep="_")] / sizeFactors[[1]][it]), col = it + 1, lty = it)
1033
	}
1034
  legend("topright", col=(1:length(sample_ids))+1, lty=1:length(sample_ids), legend=cols)
1035
}
1036

    
1037
analyse_design = function(# Launch deseq methods.
1038
### This function is based on deseq example. It mormalizes data, fit data to GLM model with and without interaction term and compare the two l;=models.
1039
snep_design, ##<< The design to considere.
1040
reads ##<< The data to considere.
1041
) {
1042
	snep_count_table = reads[, rownames(snep_design)]
1043
	cdsFull = newCountDataSet(snep_count_table, snep_design)
1044
	cdsFull1 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
1045
	fit1 = fitNbinomGLMs(cdsFull1, count ~ manip * strain)
1046

    
1047
	cdsFull0 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
1048
	fit0 = fitNbinomGLMs(cdsFull0, count ~ manip + strain)
1049

    
1050
	pvalsGLM = nbinomGLMTest( fit1, fit0 )
1051
	return(list(fit1, fit0, snep_design, pvalsGLM))
1052
}
1053

    
1054

    
1055

    
1056

    
1057

    
1058

    
1059

    
1060

    
1061
get_sneps = structure(function(# Compute the list of SNEPs for a given set of marker, strain combination and nuc form.
1062
### This function uses
1063
marker, ##<< The marker involved.
1064
combi, ##<< The strain combination involved.
1065
form, ##<< the nuc form involved.
1066
all_samples, ##<< Global list of samples.
1067
config=NULL ##<< GLOBAL config variable
1068
) {
1069
  # PRETREAT
1070
  snep_design = get_design(marker, combi, all_samples)
1071
  reads = get_all_reads(marker, combi, form, config=config)
1072
  # RUN ANALYSE
1073
  tmp_analyse = analyse_design(snep_design, reads)
1074
  # RESULTS
1075
	fit1 = tmp_analyse[[1]]
1076
	fit0 = tmp_analyse[[2]]
1077
  k = names(fit1)
1078
  reads[[k[2]]] = signif(fit1[[k[2]]], 5)
1079
  reads[[k[3]]] = signif(fit1[[k[3]]], 5)
1080
  reads[[k[4]]] = signif(fit1[[k[4]]], 5)
1081
	reads$pvalsGLM = signif(tmp_analyse[[4]], 5)
1082
	snep_design = tmp_analyse[[3]]
1083
  # print(snep_design)
1084
	fdr = 0.0001
1085
	thres = FDR(reads$pvalsGLM, fdr)
1086
	reads$snep_index = reads$pvalsGLM < thres
1087
	print(paste(sum(reads$snep_index), " SNEPs found for ", length(reads[,1])," nucs and ", fdr*100,"% of FDR.", sep = ""))
1088
  return(reads)
1089
  },  ex=function(){
1090
    marker = "H3K4me1"
1091
    combi = c("BY", "YJM")
1092
    form = "wpunr" # "wp" | "unr" | "wpunr"
1093
    # foo = get_sneps(marker, combi, form)
1094
    # foo = get_sneps("H4K12ac", c("BY", "RM"), "wp")
1095
})
1096

    
1097

    
1098

    
1099

    
1100

    
1101

    
1102

    
1103

    
1104

    
1105

    
1106

    
1107

    
1108

    
1109

    
1110

    
1111

    
1112

    
1113

    
1114

    
1115

    
1116

    
1117

    
1118

    
1119

    
1120
ROM2ARAB = function(# Roman to Arabic pair list.
1121
### Util to convert Roman to Arabic
1122
){list(
1123
  "I" = 1,
1124
  "II" = 2,
1125
  "III" = 3,
1126
  "IV" = 4,
1127
  "V" = 5,
1128
  "VI" = 6,
1129
  "VII" = 7,
1130
  "VIII" = 8,
1131
  "IX" = 9,
1132
  "X" = 10,
1133
  "XI" = 11,
1134
  "XII" = 12,
1135
  "XIII" = 13,
1136
  "XIV" = 14,
1137
  "XV" = 15,
1138
  "XVI" = 16,
1139
  "XVII" = 17,
1140
  "XVIII" = 18,
1141
  "XIX" = 19,
1142
  "XX" = 20
1143
)}
1144

    
1145
switch_pairlist = structure(function(# Switch a pairlist
1146
### Take a pairlist key:value and return the switched pairlist value:key.
1147
l ##<< The pairlist to switch.
1148
) {
1149
	ret = list()
1150
	for (name in names(l)) {
1151
		ret[[as.character(l[[name]])]] = name
1152
	}
1153
	ret
1154
### The switched pairlist.
1155
}, ex=function(){
1156
	l = list(key1 = "value1", key2 = "value2")
1157
	print(switch_pairlist(l))
1158
})
1159

    
1160
ARAB2ROM = function(# Arabic to Roman pair list.
1161
### Util to convert Arabicto Roman
1162
){switch_pairlist(ROM2ARAB())}
1163

    
1164

    
1165

    
1166

    
1167
c2c_extraction = function(# Extract a sub part of the corresponding c2c file
1168
### This fonction allow to acces to a specific part of the c2c file.
1169
strain1, ##<< the key strain
1170
strain2, ##<< the target strain
1171
chr=NULL, ##<< if defined, the c2c will filtered according to the chromosome value
1172
lower_bound=NULL, ##<< if defined, the c2c will filtered for part of the genome upper than lower_bound
1173
upper_bound=NULL, ##<< if defined, the c2c will filtered for part of the genome lower than upper_bound
1174
config=NULL##<<  GLOBAL config variable
1175
) {
1176
  reverse = (strain1=="RM" & strain2=="BY") | strain1=="YJM"
1177
	# Launch c2c file
1178
	if (reverse) {
1179
		c2c_filename = config$C2C_FILES[[paste(strain2, "-", strain1, sep="")]]
1180
	} else {
1181
		c2c_filename = config$C2C_FILES[[paste(strain1, "-", strain2, sep="")]]
1182
	}
1183
	c2c = get_content(c2c_filename, "table", stringsAsFactors=FALSE)
1184
  # Filtering unagapped
1185
  c2c = c2c[c2c$V6=="-",]
1186
	# Reverse
1187
	if (reverse) {
1188
		tmp_col = c2c$V1
1189
		c2c$V1 = c2c$V7
1190
		c2c$V7 = tmp_col
1191
		tmp_col = c2c$V2
1192
		c2c$V2 = c2c$V9
1193
		c2c$V9 = tmp_col
1194
		tmp_col = c2c$V3
1195
		c2c$V3 = c2c$V10
1196
		c2c$V10 = tmp_col
1197
	}
1198
  if (!is.null(chr)) {
1199
  	if (strain1 == "BY") {
1200
  		chro_1 = paste("chr", ARAB2ROM()[[chr]], sep="")
1201
  	} else if (strain1 == "RM") {
1202
  	  chro_1 = paste("supercontig_1.",chr,sep="")
1203
  	} else if (strain1 == "YJM") {
1204
  	  chro_1 = switch_pairlist(config$FASTA_INDEXES$YJM)[[chr]]
1205
  	}
1206
  	c2c = c2c[c2c$V1 == chro_1,]
1207
    if (!is.null(lower_bound)) {
1208
      if (length(c2c[c2c$V3 < lower_bound & c2c$V2 < c2c$V3, 1] > 0)) {c2c[c2c$V3 < lower_bound & c2c$V2 < c2c$V3,c("V2", "V3") ] = lower_bound}
1209
      if (length(c2c[c2c$V2 < lower_bound & c2c$V3 < c2c$V2, 1] > 0)) {c2c[c2c$V2 < lower_bound & c2c$V3 < c2c$V2,c("V2", "V3") ] = lower_bound}      
1210
      c2c = c2c[c2c$V2 - c2c$V3 != 0,]
1211
    }
1212
    if (!is.null(upper_bound)) {
1213
      if (length(c2c[c2c$V2 > upper_bound & c2c$V2 < c2c$V3, 1] > 0)) {c2c[c2c$V2 > upper_bound & c2c$V2 < c2c$V3, c("V2", "V3")] = upper_bound}
1214
      if (length(c2c[c2c$V3 > upper_bound & c2c$V3 < c2c$V2, 1] > 0)) {c2c[c2c$V3 > upper_bound & c2c$V3 < c2c$V2, c("V2", "V3")] = upper_bound}      
1215
      c2c = c2c[c2c$V2 - c2c$V3 != 0,]
1216
    }
1217
  }
1218
  return(c2c)
1219
# It retruns the appropriate c2c file part.
1220
}
1221

    
1222
translate_cur = structure(function(# Translate coords of a genome region.
1223
### This function is used in the examples, usualy you have to define your own translation function and overwrite this one using \emph{unlockBinding} features. Please, refer to the example.
1224
roi, ##<< Original genome region of interest.
1225
strain2, ##<< The strain in wich you want the genome region of interest.
1226
config=NULL, ##<< GLOBAL config variable
1227
big_cur=NULL ##<< A largest region than roi use to filter c2c if it is needed.
1228
) {
1229
	strain1 = roi$strain_ref
1230
	if (strain1 == strain2) {
1231
		roi$length = roi$end - roi$begin + sign(roi$end - roi$begin) * 1
1232
		return(roi)
1233
	}
1234

    
1235
	# Extract c2c file
1236
	if (!is.null(big_cur)) {
1237
  	# Dealing with big_cur
1238
		if (roi$strain_ref != big_cur$strain_ref) {
1239
      big_cur = translate_cur(big_cur, roi$strain_ref, config=config)
1240
    }
1241
    if (big_cur$end < big_cur$begin) {
1242
      tmp_var = big_cur$begin
1243
      big_cur$begin = big_cur$end
1244
      big_cur$end = tmp_var
1245
      big_cur$length = big_cur$end - big_cur$begin + 1
1246
    }
1247
    if (big_cur$chr!=roi$chr | roi$end > big_cur$end | roi$end < big_cur$begin | roi$begin > big_cur$end | roi$begin < big_cur$begin) {
1248
      print("WARNING! Trying to translate a roi not included in a big_cur.")
1249
      return(NULL)
1250
    }
1251
  	c2c = c2c_extraction(strain1, strain2, big_cur$chr, big_cur$begin, big_cur$end, config=config)
1252
  } else {
1253
    # No big_cur
1254
  	c2c = c2c_extraction(strain1, strain2, roi$chr, config=config)    
1255
  }
1256
  
1257
  #	Convert initial roi$chr into c2c format
1258
  reverse = (strain1=="RM" & strain2=="BY") | strain1=="YJM"
1259
	begin_1 = roi$begin
1260
  end_1 = roi$end
1261
  if (reverse) {
1262
  	tmptransfostart = c2c[(c2c$V3>=begin_1 & c2c$V2<=begin_1 & c2c$V8==1) | (c2c$V2>=begin_1 & c2c$V3<=begin_1 & c2c$V8==-1),]
1263
    tmptransfostop = c2c[(c2c$V3>=end_1   & c2c$V2<=end_1   & c2c$V8==1) | (c2c$V2>=end_1   & c2c$V3<=end_1   & c2c$V8==-1),]
1264
	} else {
1265
		tmptransfostart = c2c[c2c$V3>=begin_1 & c2c$V2<=begin_1,]
1266
	  tmptransfostop = c2c[c2c$V3>=end_1 & c2c$V2<=end_1,]
1267
	}
1268

    
1269
	# Never happend conditions ...
1270
	{
1271
		if (length(tmptransfostart$V8) == 0) {
1272
			# begin_1 is between to lines: shift begin_1 to the start of 2nd line.
1273
      if (sum(c2c$V3 >= c2c$V2) != 0) {
1274
  			tmp_c2c = c2c[c2c$V2>=begin_1,]
1275
  			begin_1 = min(tmp_c2c$V2)
1276
      } else {
1277
  			tmp_c2c = c2c[c2c$V3>=begin_1,]
1278
  			begin_1 = min(tmp_c2c$V3)
1279
      }
1280
			if (reverse) {
1281
		  	tmptransfostart = c2c[(c2c$V3>=begin_1 & c2c$V2<=begin_1 & c2c$V8==1) | (c2c$V2>=begin_1 & c2c$V3<=begin_1 & c2c$V8==-1),]
1282
			} else {
1283
				tmptransfostart = c2c[c2c$V3>=begin_1 & c2c$V2<=begin_1,]
1284
			}
1285
			if (length(tmptransfostart$V8) == 0) {
1286
				if (!is.null(big_cur)) {
1287
					return(NULL)
1288
					tmptransfostart = c2c[c2c$V3>=big_cur$begin & c2c$V2<=big_cur$begin,]
1289
				} else {
1290
					print(tmptransfostart)
1291
					print(tmptransfostop)
1292
					stop("Never happend condition 1.")
1293
				}
1294
			}
1295
		}
1296
		if (length(tmptransfostop$V8) == 0) {
1297
			# end_1 is between to lines: shift end_1 to the end of 2nd line.
1298
      if (sum(c2c$V3 >= c2c$V2) != 0) {
1299
  			tmp_c2c = c2c[c2c$V3<=end_1,]
1300
  			end_1 = max(tmp_c2c$V3)
1301
      } else {
1302
  			tmp_c2c = c2c[c2c$V2<=end_1,]
1303
  			end_1 = max(tmp_c2c$V2)
1304
      }
1305
			if (reverse) {
1306
		    tmptransfostop = c2c[(c2c$V3>=end_1   & c2c$V2<=end_1   & c2c$V8==1) | (c2c$V2>=end_1   & c2c$V3<=end_1   & c2c$V8==-1),]
1307
			} else {
1308
			  tmptransfostop = c2c[c2c$V3>=end_1 & c2c$V2<=end_1,]
1309
			}
1310
			if (length(tmptransfostop$V8) == 0) {
1311
				if (!is.null(big_cur)) {
1312
					return(NULL)
1313
				  tmptransfostop = c2c[c2c$V3>=big_cur$end & c2c$V2<=big_cur$end,]
1314
				} else {
1315
					print(tmptransfostart)
1316
					print(tmptransfostop)
1317
					stop("Never happend condition 2.")
1318
				}
1319
			}
1320
		}
1321
		if (length(tmptransfostart$V8) != 1) {
1322
			# print("many start")
1323
			tmptransfostart = tmptransfostart[tmptransfostart$V3>=begin_1 & tmptransfostart$V2==begin_1,]
1324
			if (length(tmptransfostart$V8) != 1) {
1325
				print(tmptransfostart)
1326
				print(tmptransfostop)
1327
  			stop("Never happend condition 3.")
1328
			}
1329
		}
1330
		if (length(tmptransfostop$V8) != 1) {
1331
			# print("many stop")
1332
		  tmptransfostop = tmptransfostop[tmptransfostop$V3==end_1 & tmptransfostop$V2<=end_1,]
1333
			if (length(tmptransfostop$V8) != 1) {
1334
				print(tmptransfostart)
1335
				print(tmptransfostop)
1336
  			stop("Never happend condition 4.")
1337
			}
1338
		}
1339
		if (tmptransfostart$V7 != tmptransfostop$V7) {
1340
			print(tmptransfostart)
1341
			print(tmptransfostop)
1342
 			stop("Problem with genome region of interest of strain 1. \nIt is translated over many contigs into strain 2 ref. \nSorry, but you have to redefine your region of interest.")
1343
		}
1344
	}
1345
  
1346
  # Deal with strand
1347
  if (tmptransfostart$V8 == 1) {
1348
    begin_2 = tmptransfostart$V9 + (begin_1 - tmptransfostart$V2)
1349
    end_2 = tmptransfostop$V9 + (end_1 - tmptransfostop$V2)
1350
  } else {
1351
    begin_2 = tmptransfostart$V9 - (begin_1 - tmptransfostart$V2)
1352
    end_2 = tmptransfostop$V9 - (end_1 - tmptransfostop$V2)
1353
  }
1354
	# Build returned roi
1355
	roi$strain_ref = strain2
1356
	if (roi$strain_ref == "BY") {
1357
		roi$chr = ROM2ARAB()[[substr(tmptransfostart$V7, 4, 12)]]
1358
	} else {
1359
		roi$chr = config$FASTA_INDEXES[[strain2]][[tmptransfostart$V7]]
1360
	}
1361
  roi$begin = begin_2
1362
  roi$end = end_2
1363
	if (sign(roi$end - roi$begin) == 0) {
1364
		roi$length = 1
1365
	} else {
1366
		roi$length = roi$end - roi$begin + sign(roi$end - roi$begin) * 1
1367
	}
1368
  return(roi)
1369
}, ex=function(){
1370
	# Define new translate_cur function...
1371
	translate_cur = function(roi, strain2, config) {
1372
		strain1 = roi$strain_ref
1373
		if (strain1 == strain2) {
1374
			return(roi)
1375
		} else {
1376
		  stop("Here is my new translate_cur function...")
1377
		}
1378
	}
1379
	# Binding it by uncomment follwing lines.
1380
	# unlockBinding("translate_cur", as.environment("package:nm"))
1381
	# unlockBinding("translate_cur", getNamespace("nm"))
1382
	# assign("translate_cur", translate_cur, "package:nm")
1383
	# assign("translate_cur", translate_cur, getNamespace("nm"))
1384
	# lockBinding("translate_cur", getNamespace("nm"))
1385
	# lockBinding("translate_cur", as.environment("package:nm"))
1386
})
1387

    
1388

    
1389
compute_inter_all_strain_curs = function (# Compute Common Uninterrupted Regions (CUR)
1390
### CURs are regions that can be aligned between the genomes
1391
diff_allowed = 30, ##<< the maximum indel width allowe din a CUR
1392
min_cur_width = 4000, ##<< The minimum width of a CUR
1393
config = NULL ##<< GLOBAL config variable
1394
) {
1395

    
1396
  check_overlaping = function(strain1, strain2, chr, lower_bound, upper_bound, config=NULL) {
1397
    c2c = c2c_extraction(strain1, strain2, chr, lower_bound, upper_bound, config=config)
1398
    check_homogeneity(c2c)
1399
  	if (length(c2c[,1]) == 0 ) {
1400
      stop("WARNING! checking overlapping for a region corresponding to an empty c2c.")
1401
    } else {
1402
  		lower_bounds = apply(t(1:nrow(c2c)), 2,function(i){l = c2c[i,]; min(l$V2, l$V3)})
1403
  		upper_bounds = apply(t(1:nrow(c2c)), 2,function(i){l = c2c[i,]; max(l$V2, l$V3)})
1404
  		tmp_index = order(lower_bounds)
1405
      lower_bounds = lower_bounds[tmp_index]
1406
      upper_bounds = upper_bounds[tmp_index]
1407
      tmp_diff = lower_bounds[-1] - upper_bounds[-length(upper_bounds)]
1408
      ov_index = which(tmp_diff < 0)
1409
      if(length(ov_index < 0) !=0 ) {
1410
        ov_index = ov_index[1]
1411
        print(paste("WARNING! Overlaping", " (", strain1, ",", strain2, ") chr: ", c2c[1,]$V1, sep=""))
1412
        c2c_corrupted = c2c[tmp_index,][c(ov_index, ov_index + 1),]
1413
        print(c2c_corrupted)
1414
        return(list(lower_bounds[ov_index+1] - 1, upper_bounds[ov_index] + 1))
1415
      }
1416
      return(NULL)
1417
    }
1418
  }
1419

    
1420
  check_homogeneity = function(sub_c2c) {
1421
    tmp_signs = sign(sub_c2c$V2 - sub_c2c$V3)
1422
    tmp_signs = tmp_signs[tmp_signs != 0]
1423
  	if (sum(tmp_signs[1]  != tmp_signs)) {
1424
  		print(paste("*************** ERROR, non homogenous region (sign)! ********************"))
1425
      print(tmp_signs)
1426
  	}
1427
    tmp_signs2 = sign(sub_c2c$V9 - sub_c2c$V10)
1428
    tmp_signs2 = tmp_signs2[tmp_signs2 != 0]
1429
  	if (sum(tmp_signs2[1]  != tmp_signs2)) {
1430
  		print(paste("*************** ERROR, non homogenous region (sign2)! ********************"))
1431
      print(tmp_signs2)
1432
  	}
1433
  	if (length(unique(sub_c2c[,c(1,7,8)])[,2]) != 1) {
1434
  		print("*************** ERROR, non homogenous region chrs or V8! ********************")
1435
  	}
1436
  }
1437

    
1438
  test_and_squeeze_rois = function(foo, config=NULL) {
1439
    is_it_ok = function(list1, list2) {
1440
      bar = cbind(list1$begin, list2$begin, abs(list1$begin - list2$begin), list1$end, list2$end, abs(list1$end - list2$end), list1$length, list2$length, abs(list1$length - list2$length))
1441
      ok = length(bar[bar[,3] != 0 | bar[,6] != 0, ]) == 0
1442
      if (!ok) {
1443
        print(bar[bar[,3] != 0 | bar[,6] != 0, ])
1444
      }
1445
      return (ok)
1446
    }
1447
    squeeze_rois = function(list1, list2) {
1448
      rois = apply(t(1:nrow(list1)), 2, function(i){
1449
        roi = list1[i,]
1450
        roi2 = list2[i,]
1451
        roi$begin = max(roi$begin, roi2$begin)
1452
        roi$end = min(roi$end, roi2$end)
1453
        roi$length =  roi$end - roi$begin + 1
1454
        return(roi)
1455
      })
1456
      return(do.call(rbind, rois))
1457
    }
1458
    # foo_orig = compute_inter_all_strain_curs2(config=config)
1459
    # foo = foo_orig
1460
    STOP = FALSE
1461
    nb_round = 0
1462
    while(!STOP) {
1463
      nb_round = nb_round + 1
1464
      print(paste("2-2 round #", nb_round, sep=""))
1465
      fooby = translate_curs(foo, "BY", config=config)
1466
      fooyjm = translate_curs(foo, "YJM", config=config)
1467
      fooyjmby = translate_curs(fooyjm, "BY", config=config)
1468
      if (!is_it_ok(fooby, fooyjmby)) {
1469
        print("case 1")
1470
        foo = squeeze_rois(fooby, fooyjmby)
1471
    		next
1472
      }
1473
      foorm = translate_curs(foo, "RM", config=config)
1474
      foormby = translate_curs(foorm, "BY", config=config)
1475
      if (!is_it_ok(fooby, foormby)) {
1476
        print("case 2")
1477
        foo = squeeze_rois(fooby, foormby)
1478
    		next
1479
      }
1480
      fooyjmrm = translate_curs(fooyjm, "RM", config=config)
1481
      fooyjmrmyjm = translate_curs(fooyjmrm, "YJM", config=config)
1482
      if (!is_it_ok(fooyjm, fooyjmrmyjm)) {
1483
        print("case 3")
1484
        foo = squeeze_rois(fooyjm, fooyjmrmyjm)
1485
        next
1486
      }
1487
      foormyjm = translate_curs(foorm, "YJM", config=config)
1488
      foormyjmrm = translate_curs(foormyjm, "RM", config=config)
1489
      if (!is_it_ok(foorm, foormyjmrm)) {
1490
        print("case 4")
1491
        foo = squeeze_rois(foorm, foormyjmrm)
1492
        next
1493
      }
1494
      foo = translate_curs(foo, "BY", config=config)
1495
      STOP = TRUE
1496
    }
1497
    STOP = FALSE
1498
    nb_round = 0
1499
    while(!STOP) {
1500
      nb_round = nb_round + 1
1501
      print(paste("3-3 round #", nb_round, sep=""))
1502
      fooby = translate_curs(foo, "BY", config=config)
1503
      foobyrm = translate_curs(fooby, "RM", config=config)
1504
      foobyrmyjm = translate_curs(foobyrm, "YJM", config=config)
1505
      foobyrmyjmby = translate_curs(foobyrmyjm, "BY", config=config)
1506
      if (!is_it_ok(fooby, foobyrmyjmby)) {
1507
        print("case 1")
1508
        foo = squeeze_rois(fooby, foobyrmyjmby)
1509
      }
1510
      fooby = translate_curs(foo, "BY", config=config)
1511
      foobyyjm = translate_curs(fooby, "YJM", config=config)
1512
      foobyyjmrm = translate_curs(foobyyjm, "RM", config=config)
1513
      foobyyjmrmby = translate_curs(foobyyjmrm, "BY", config=config)
1514
      if (!is_it_ok(fooby, foobyyjmrmby)) {
1515
        print("case 2")
1516
        foo = squeeze_rois(fooby, foobyyjmrmby)
1517
        next
1518
      }
1519
      foo = translate_curs(foo, "BY", config=config)
1520
      STOP = TRUE
1521
    }
1522
    print("end")
1523
    return(foo)
1524
  }
1525

    
1526
  get_inter_strain_rois = function(strain1, strain2, diff_allowed = 30, min_cur_width = 200, config=NULL) {
1527
    c2c = c2c_extraction(strain1, strain2, config=config)
1528
    # computing diffs
1529
    diff = c2c$V2[-1] - c2c$V3[-length(c2c$V2)]
1530
    diff2 = c2c$V9[-1] - c2c$V10[-length(c2c$V2)]
1531
    # Filtering
1532
  	indexes_stop = which(abs(diff) > diff_allowed | abs(diff2) > diff_allowed)
1533
  	indexes_start = c(1, indexes_stop[-length(indexes_stop)] + rep(1, length(indexes_stop) -1))
1534
    rois = apply(t(1:length(indexes_start)), 2, function(i) {
1535
      if ( i %% 20 == 1) print(paste(i, "/", length(indexes_start)))
1536
      returned_rois = NULL
1537
  		start = indexes_start[i]
1538
  		stop = indexes_stop[i]
1539
  		sub_c2c = c2c[start:stop,]
1540
      check_homogeneity(sub_c2c)
1541
  		if (strain1 == "BY") {
1542
  			chr = ROM2ARAB()[[substr(sub_c2c[1,]$V1,4,10)]]
1543
  		} else {
1544
  			chr = config$FASTA_INDEXES[[strain1]][[sub_c2c[1,]$V1]]
1545
  		}
1546
  		roi = list(chr=chr, begin=min(c(sub_c2c$V2,sub_c2c$V3)), end=max(c(sub_c2c$V2,sub_c2c$V3)), strain_ref=strain1)
1547
  		roi$length = roi$end - roi$begin + 1
1548
  		if (roi$length >= min_cur_width) {
1549
  			lower_bound = roi$begin
1550
  			upper_bound = roi$end
1551
        check = check_overlaping(strain1, strain2, chr, lower_bound, upper_bound, config=config)
1552
        while(!is.null(check)) {
1553
          # print(check)
1554
      		roi1 = roi
1555
          roi1$end = check[[1]]
1556
      		roi1$length = roi1$end - roi1$begin + 1
1557
      		if (roi1$length >= min_cur_width) {
1558
            returned_rois = dfadd(returned_rois,roi1)
1559
          }
1560
          roi$begin = check[[2]]
1561
      		roi$length = roi$end - roi$begin + 1
1562
      		if (roi$length >= min_cur_width) {
1563
      			lower_bound = min(roi$begin, roi$end)
1564
      			upper_bound = max(roi$begin, roi$end)
1565
            check = check_overlaping(strain1, strain2, chr, lower_bound, upper_bound, config=config)
1566
          } else {
1567
            check = NULL
1568
            roi = NULL
1569
          }
1570
        }
1571
        returned_rois = dfadd(returned_rois,roi)
1572
  	  }
1573
    })
1574
    rois = do.call(rbind,rois)
1575
    rois = rois[order(as.numeric(rois$chr), rois$begin), ]
1576
  	return(rois)
1577
  }
1578

    
1579
  translate_curs = function(rois, target_strain, config) {
1580
    tr_rois = apply(t(1:nrow(rois)), 2, function(i){
1581
      roi = rois[i,]
1582
      tr_roi = translate_cur(roi, target_strain, config=config)  
1583
      tmp_begin = min(tr_roi$begin, tr_roi$end)
1584
      tmp_end = max(tr_roi$begin, tr_roi$end)
1585
      tr_roi$begin = tmp_begin
1586
      tr_roi$end = tmp_end
1587
      tr_roi$length =  tr_roi$end - tr_roi$begin + 1
1588
      return(tr_roi)
1589
    })
1590
    tr_rois = do.call(rbind, tr_rois)
1591
    return(tr_rois)
1592
  }
1593

    
1594
  combis = list(c("BY", "RM"), c("BY", "YJM"), c("RM", "YJM"))
1595
  rois = list()
1596
  for (combi in combis) {
1597
    strain1 = combi[1]
1598
    strain2 = combi[2]
1599
    print(paste(strain1, strain2))
1600
    rois_fwd = get_inter_strain_rois(strain1, strain2, min_cur_width = min_cur_width, diff_allowed = diff_allowed, config=config)
1601
    strain1 = combi[2]
1602
    strain2 = combi[1]
1603
    print(paste(strain1, strain2))
1604
    rois_rev = get_inter_strain_rois(strain1, strain2, min_cur_width = min_cur_width, diff_allowed = diff_allowed, config=config)
1605
    tr_rois_rev = translate_curs(rois_rev, combi[1], config)      
1606
    region1 = rois_fwd
1607
    region2 = tr_rois_rev
1608
    rois[[paste(combi[1], combi[2], sep="_")]] = intersect_region(rois_fwd, tr_rois_rev)
1609
  }
1610
  reducted_1_rois = intersect_region(rois[["BY_RM"]], rois[["BY_YJM"]])
1611
  reducted_1_rois = reducted_1_rois[reducted_1_rois$length >= min_cur_width, ]
1612
  tr_reducted_1_rois = translate_curs(reducted_1_rois, "RM", config)  
1613
  reducted_2_rois = intersect_region(tr_reducted_1_rois, rois[["RM_YJM"]])
1614
  reducted_2_rois = reducted_2_rois[reducted_2_rois$length >= min_cur_width, ]
1615
  reducted_rois = translate_curs(reducted_2_rois, "BY", config)  
1616
  reducted_rois = reducted_rois[order(as.numeric(reducted_rois$chr), reducted_rois$begin), ]
1617
  squeezed_rois = test_and_squeeze_rois(reducted_rois, config=config)
1618
  return (squeezed_rois)
1619
}
1620

    
1621
intersect_region = function(# Returns the intersection of 2 list on regions.
1622
### This function...
1623
region1, ##<< Original regions.
1624
region2 ##<< Regions to intersect.
1625
) {
1626
  intersection = apply(t(1:nrow(region1)), 2, function(i) {
1627
    roi1 = region1[i, ]
1628
    sub_regions2 = region2[region2$chr == roi1$chr, ]
1629
    sub_regions2 = sub_regions2[roi1$begin <= sub_regions2$begin & sub_regions2$begin <= roi1$end |
1630
                                roi1$begin <= sub_regions2$end & sub_regions2$end <= roi1$end |
1631
                                sub_regions2$begin < roi1$begin  & roi1$end < sub_regions2$end
1632
                                , ]
1633
    if (nrow(sub_regions2) == 0) {
1634
      print("removing a region")
1635
      return(NULL)
1636
    } else if (nrow(sub_regions2) > 1) {
1637
      print("more than one region in intersect_region")          
1638
      return(do.call(rbind, apply(t(1:nrow(sub_regions2)), 2, function(i) {intersect_region(roi1, sub_regions2[i,])})))
1639
    } else {
1640
      roi2 = sub_regions2[1,]
1641
      if (roi1$begin < roi2$begin) {
1642
        print("not the same begin")
1643
        roi1$begin = roi2$begin
1644
        roi1$length =  roi1$end - roi1$begin + 1
1645
      }
1646
      if (roi1$end > roi2$end) {
1647
        print("not the same end")
1648
        roi1$end = roi2$end
1649
        roi1$length =  roi1$end - roi1$begin + 1
1650
      }
1651
      return(roi1)
1652
    }         
1653
  })
1654
  return(do.call(rbind,intersection))
1655
}
1656

    
1657
build_replicates = structure(function(# Stage replicates data
1658
### This function loads in memory data corresponding to the given experiments.
1659
expe, ##<< a list of vector corresponding to vector of replicates.
1660
roi, ##<< the region that we are interested in.
1661
only_fetch=FALSE, ##<< filter or not inputs.
1662
get_genome=FALSE,##<< Load or not corresponding genome.
1663
all_samples, ##<< Global list of samples.
1664
config=NULL ##<< GLOBAL config variable.
1665
) {
1666
  build_samples = function(samples_ids, roi, only_fetch=FALSE, get_genome=TRUE, get_ouputs=TRUE, all_samples) {
1667
  	samples=list()
1668
  	for (i in samples_ids) {
1669
  		sample = as.list(all_samples[all_samples$id==i,])
1670
      sample$orig_roi = roi
1671
      sample$roi = translate_cur(roi, sample$strain, config = config)
1672
  		if (get_genome) {
1673
  			# Get Genome
1674
  			fasta_ref_filename = config$FASTA_REFERENCE_GENOME_FILES[[sample$strain]]
1675
  			sample$roi$genome = get_content(fasta_ref_filename, "fasta")[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end]
1676
  		}
1677
  		# Get inputs
1678
  		sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep="")
1679
  		sample$inputs = get_content(sample_inputs_filename, "table", stringsAsFactors=FALSE)
1680
  		sample$total_reads = sum(sample$inputs[,4])
1681
  		if (!only_fetch) {
1682
  		  sample$inputs = filter_tf_inputs(sample$inputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300, filter_for_coverage=TRUE)
1683
  	  }
1684
  	  # Get TF outputs for Mnase_Seq samples
1685
  		if (sample$marker == "Mnase_Seq" & get_ouputs) {
1686
  			sample_outputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep="")
1687
  			sample$outputs = get_content(sample_outputs_filename, "table", header=TRUE, sep="\t")
1688
  			if (!only_fetch) {
1689
  	  		sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr,  min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
1690
    		}
1691
  		}
1692
  		samples[[length(samples) + 1]] = sample
1693
  	}
1694
  	return(samples)
1695
  }
1696
	replicates = list()
1697
	for(samples_ids in expe) {
1698
		samples = build_samples(samples_ids, roi, only_fetch=only_fetch, get_genome=get_genome, all_samples=all_samples)
1699
		replicates[[length(replicates) + 1]] = samples
1700
	}
1701
	return(replicates)
1702
  }, ex = function() {
1703
    # library(rjson)
1704
    # library(nucleominer)
1705
    #
1706
    # # Read config file
1707
    # json_conf_file = "nucleo_miner_config.json"
1708
    # config = fromJSON(paste(readLines(json_conf_file), collapse=""))
1709
    # # Read sample file
1710
    # all_samples = get_content(config$CSV_SAMPLE_FILE, "cvs", sep=";", head=TRUE, stringsAsFactors=FALSE)
1711
    # # here are the sample ids in a list
1712
    # expes = list(c(1))
1713
    # # here is the region that we wnt to see the coverage
1714
    # cur = list(chr="8", begin=472000, end=474000, strain_ref="BY")
1715
    # # it displays the corverage
1716
    # replicates = build_replicates(expes, cur, all_samples=all_samples, config=config)
1717
    # out = watch_samples(replicates, config$READ_LENGTH,
1718
    #       plot_coverage = TRUE,
1719
    #       plot_squared_reads = FALSE,
1720
    #       plot_ref_genome = FALSE,
1721
    #       plot_arrow_raw_reads = FALSE,
1722
    #       plot_arrow_nuc_reads = FALSE,
1723
    #       plot_gaussian_reads = FALSE,
1724
    #       plot_gaussian_unified_reads = FALSE,
1725
    #       plot_ellipse_nucs = FALSE,
1726
    #       plot_wp_nucs = FALSE,
1727
    #       plot_wp_nuc_model = FALSE,
1728
    #       plot_common_nucs = FALSE,
1729
    #       height = 50)
1730
  })
1731

    
1732

    
1733

    
1734

    
1735

    
1736

    
1737

    
1738

    
1739

    
1740

    
1741

    
1742

    
1743

    
1744

    
1745

    
1746

    
1747

    
1748

    
1749

    
1750

    
1751

    
1752

    
1753

    
1754

    
1755

    
1756

    
1757

    
1758

    
1759

    
1760

    
1761

    
1762

    
1763

    
1764

    
1765

    
1766

    
1767

    
1768

    
1769

    
1770

    
1771

    
1772

    
1773

    
1774

    
1775

    
1776

    
1777

    
1778

    
1779

    
1780

    
1781

    
1782

    
1783

    
1784

    
1785

    
1786

    
1787

    
1788

    
1789

    
1790

    
1791

    
1792

    
1793

    
1794

    
1795

    
1796

    
1797

    
1798

    
1799

    
1800

    
1801

    
1802

    
1803

    
1804

    
1805

    
1806

    
1807

    
1808

    
1809

    
1810

    
1811

    
1812

    
1813

    
1814
watch_samples = function(# Watching analysis of samples
1815
### This function allows to view analysis for a particuler region of the genome.
1816
replicates, ##<< replicates under the form...
1817
read_length, ##<< length of the reads
1818
plot_ref_genome = TRUE, ##<< Plot (or not) reference genome.
1819
plot_arrow_raw_reads = TRUE,  ##<< Plot (or not) arrows for raw reads.
1820
plot_arrow_nuc_reads = TRUE,  ##<< Plot (or not) arrows for reads aasiocied to a nucleosome.
1821
plot_squared_reads = TRUE,  ##<< Plot (or not) reads in the square fashion.
1822
plot_coverage = FALSE,  ##<< Plot (or not) reads in the covergae fashion. fashion.
1823
plot_gaussian_reads = TRUE,  ##<< Plot (or not) gaussian model of a F anf R reads.
1824
plot_gaussian_unified_reads = TRUE,  ##<< Plot (or not) gaussian model of a nuc.
1825
plot_ellipse_nucs = TRUE,  ##<< Plot (or not) ellipse for a nuc.
1826
change_col = TRUE, ##<< Change the color of each nucleosome.
1827
plot_wp_nucs = TRUE,  ##<< Plot (or not) cluster of nucs
1828
plot_fuzzy_nucs = TRUE,  ##<< Plot (or not) cluster of fuzzy
1829
plot_wp_nuc_model = TRUE,  ##<< Plot (or not) gaussian model for a cluster of nucs
1830
plot_common_nucs = FALSE,  ##<< Plot (or not) aligned reads.
1831
plot_common_unrs = FALSE,  ##<< Plot (or not) unaligned nucleosomal refgions (UNRs).
1832
plot_wp_nucs_4_nonmnase = FALSE,  ##<< Plot (or not) clusters for non inputs samples.
1833
plot_chain = FALSE,  ##<< Plot (or not) clusterised nuceosomes between mnase samples.
1834
plot_sample_id = FALSE, ##<<  Plot (or not) the sample id for each sample.
1835
aggregated_intra_strain_nucs = NULL, ##<< list of aggregated intra strain nucs. If NULL, it will be computed.
1836
aligned_inter_strain_nucs = NULL, ##<< list of aligned inter strain nucs. If NULL, it will be computed.
1837
height = 10, ##<< Number of reads in per million read for each sample, graphical parametre for the y axis.
1838
main=NULL, ##<< main title of the produced plot
1839
xlab=NULL, ##<< xlab of the produced plot
1840
ylab="#reads (per million reads)", ##<< ylab of the produced plot
1841
config=NULL ##<< GLOBAL config variable
1842
){
1843
  returned_list = list()
1844
  # Computing global display parameters
1845
  if (replicates[[1]][[1]]$roi[["begin"]] < replicates[[1]][[1]]$roi[["end"]]) {
1846
	  x_min_glo = replicates[[1]][[1]]$roi[["begin"]]
1847
	  x_max_glo = replicates[[1]][[1]]$roi[["end"]]
1848
  } else {
1849
	  x_min_glo = - replicates[[1]][[1]]$roi[["begin"]]
1850
	  x_max_glo = - replicates[[1]][[1]]$roi[["end"]]
1851
  }
1852
	base_glo = 0
1853
	nb_rank_glo = 0
1854
  for (samples in replicates) {
1855
  	nb_rank_glo = nb_rank_glo + length(samples)
1856
  }
1857
	ylim_glo = c(base_glo, base_glo + height * nb_rank_glo)
1858
	y_min_glo = min(ylim_glo)
1859
	y_max_glo = max(ylim_glo)
1860
  delta_y_glo = y_max_glo - y_min_glo
1861
  # Plot main frame
1862
  if (is.null(xlab)) {
1863
    xlab = paste("Ref strain:", replicates[[1]][[1]]$strain, "chr: ", replicates[[1]][[1]]$roi$chr)
1864
  }
1865
  plot(c(x_min_glo,x_max_glo), c(0,0), ylim=ylim_glo, col=0, yaxt="n", ylab=ylab, xlab=xlab, main=main )
1866
  axis(2, at=0:(nb_rank_glo*2) * delta_y_glo / (nb_rank_glo*2), labels=c(rep(c(height/2,0),nb_rank_glo),height/2))
1867
  # Go
1868
	replicates_wp_nucs = list()
1869
  wp_maps = list()
1870
  fuzzy_maps = list()
1871
  for (replicate_rank in 1:length(replicates)) {
1872
		# Computing replicate parameters
1873
		nb_rank = length(samples)
1874
		base = (replicate_rank-1) * height * nb_rank
1875
		ylim = c(base, base + height * nb_rank)
1876
		y_min = min(ylim)
1877
		y_max = max(ylim)
1878
	  delta_y = y_max - y_min
1879
		samples = replicates[[replicate_rank]]
1880
		for (sample_rank in 1:length(samples)) {
1881
			# computing sample parameters
1882
			sample = samples[[sample_rank]]
1883
			y_lev = y_min + (sample_rank - 0.5) * delta_y/nb_rank
1884
      if (plot_sample_id) {
1885
  			text(x_min_glo, y_lev + height/2 - delta_y_glo/100, labels=paste("(",sample$id,") ",sample$strain, " ", sample$marker, sep=""))
1886
      }
1887

    
1888
		  if (sample$roi[["begin"]] < sample$roi[["end"]]) {
1889
			  x_min = sample$roi[["begin"]]
1890
			  x_max = sample$roi[["end"]]
1891
		  } else {
1892
			  x_min = - sample$roi[["begin"]]
1893
			  x_max = - sample$roi[["end"]]
1894
		  }
1895
			shift = x_min_glo - x_min
1896
	    # Plot Genome seq
1897
			if (plot_ref_genome) {
1898
		  	text(1:length(sample$roi$genome) + x_min - 1 + shift, rep(y_lev - height/2, length(sample$roi$genome)), labels=toupper(sample$roi$genome), cex=dev.size()[1]*9/(x_max-x_min), family="Courier")
1899
		  }
1900
			# Plot reads
1901
			reads = sample$inputs
1902
			signs = sign_from_strand(reads[,3])
1903
			if (plot_arrow_raw_reads) {
1904
				arrows(sign(x_min) * reads[,2] + shift, sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads + y_lev, sign(x_min) * (reads[,2] + signs * read_length) + shift, sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads + y_lev,
1905
				col=1, length=0.15/nb_rank)
1906
			}
1907
	    if (plot_squared_reads) {
1908
        # require(plotrix)
1909
				rect(sign(x_min) * reads[,2] + shift, rep(y_lev,length(reads[,1])), sign(x_min) * (reads[,2] + signs * read_length) + shift,  y_lev + sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads, col=adjustcolor(1, alpha.f = 0.1),border=0)
1910
			}
1911
	    if (plot_coverage) {
1912
        if (length(reads[,1]) != 0) {
1913
          step_h = sign(x_min) * signs * reads[,4]
1914
          step_b = sign(x_min) * reads[,2] + shift
1915
          step_e = sign(x_min) * (reads[,2] + signs * 150) + shift
1916
          steps_x = min(step_b, step_e):max(step_b, step_e)
1917
          steps_y = rep(0, length(steps_x))
1918
          for (i in 1:length(step_h)) {
1919
            steps_y[which(steps_x==min(step_b[i], step_e[i]))] =  steps_y[which(steps_x==min(step_b[i], step_e[i]))] + abs(step_h[i])
1920
            steps_y[which(steps_x==max(step_b[i], step_e[i]))] =  steps_y[which(steps_x==max(step_b[i], step_e[i]))] - abs(step_h[i])
1921
          }
1922
          tmp_index = which(steps_y != 0)
1923
          steps_x = steps_x[tmp_index]
1924
          steps_y = steps_y[tmp_index]
1925
          tmp_current_level = 0
1926
          for (i in 1:length(steps_y)) {
1927
            steps_y[i] = tmp_current_level + steps_y[i]
1928
            tmp_current_level = steps_y[i]
1929
          }
1930
          steps_y = c(0, steps_y)
1931
          steps_y = steps_y * 1000000/sample$total_reads
1932
        } else {
1933
          steps_y = c(0, 0, 0)
1934
          steps_x = c(x_min, x_max)
1935
        }
1936
        # print(steps_x)
1937
        # print(steps_y)
1938
        lines(stepfun(steps_x, steps_y + y_lev), pch="")
1939
        abline(y_lev,0)
1940
        returned_list[[paste("cov", sample$id, sep="_")]] = stepfun(steps_x, steps_y)
1941
			}
1942
			# Plot nucs
1943
	    if (sample$marker == "Mnase_Seq" & (plot_squared_reads | plot_gaussian_reads | plot_gaussian_unified_reads | plot_arrow_nuc_reads)) {
1944
				nucs = sample$outputs
1945
				if (length(nucs$center) > 0) {
1946
					col = 1
1947
		      for (i in 1:length(nucs$center)) {
1948
            foo<<-nucs
1949
            if (change_col) {
1950
  						col = col + 1
1951
              } else {
1952
                col = "blue"
1953
              }
1954
		        nuc = nucs[i,]
1955
						involved_reads = filter_tf_inputs(reads, sample$roi$chr, nuc$lower_bound, nuc$upper_bound, nuc_width = nuc$width)
1956
				  	involved_signs = apply(t(involved_reads[,3]), 2, function(strand) {	if (strand == "F") return(1) else return(-1)})
1957
						total_involved_reads = sum(involved_reads[,4])
1958
						if (plot_arrow_nuc_reads ) {
1959
							arrows(sign(x_min) * involved_reads[,2] + shift, sign(x_min) * involved_signs * involved_reads[,4] * 1000000/sample$total_reads + y_lev, sign(x_min) * (involved_reads[,2] + involved_signs * read_length) + shift, sign(x_min) * involved_signs * involved_reads[,4] * 1000000/sample$total_reads + y_lev,
1960
							col=col, length=0.15/nb_rank)
1961
						}
1962
	          if (plot_gaussian_reads | plot_gaussian_unified_reads) {
1963
  						flatted_reads = flat_reads(involved_reads, nuc$width)
1964
	  					delta_x = (nuc$center - nuc$width):(nuc$center + nuc$width)
1965
		  			}
1966
	          if (plot_gaussian_reads ) {
1967
							flatted_reads = flat_reads(involved_reads, nuc$width)
1968
							delta_x = (nuc$center - nuc$width):(nuc$center + nuc$width)
1969
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[1]]), sd(flatted_reads[[1]])) * length(flatted_reads[[1]]) * sign(x_min) * height/5 + y_lev, col=col)
1970
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[2]]), sd(flatted_reads[[2]])) * length(flatted_reads[[2]]) * -1 * sign(x_min) * height/5 + y_lev, col=col)
1971
	          }
1972
	          if (plot_gaussian_unified_reads ) {
1973
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[3]]), sd(flatted_reads[[3]])) * length(flatted_reads[[3]]) * height/5 + y_lev, col=col, lty=1)
1974
	          }
1975
	          if (plot_ellipse_nucs) {
1976
				      # require(plotrix)
1977
	  	 				draw.ellipse(sign(x_min) * nuc$center + shift, y_lev, nuc$width/2, total_involved_reads/nuc$width * height/5, border=col)
1978
						}
1979
		      }
1980
		    } else {
1981
		      print("WARNING! No nucs to print.")
1982
		    }
1983
			}
1984
	  }
1985

    
1986
	  # Plot wp nucs
1987
		if ((plot_wp_nucs_4_nonmnase | sample$marker == "Mnase_Seq") & (plot_wp_nucs | plot_fuzzy_nucs | plot_common_nucs | plot_chain)) {
1988
			if (samples[[1]]$marker == "Mnase_Seq") {
1989
				if (is.null(aggregated_intra_strain_nucs)) {
1990
	  			wp_nucs = aggregate_intra_strain_nucs(samples)[[1]]
1991
				} else {
1992
					wp_nucs = aggregated_intra_strain_nucs[[replicate_rank]]
1993
				}
1994
		  } else {
1995
  			wp_nucs = replicates_wp_nucs[[replicate_rank-2]]
1996
		  }
1997
      if (plot_chain) {
1998
        tf_nucs = lapply(wp_nucs, function(nuc) {
1999
          bar = apply(t(nuc$nucs), 2, function(tmp_nuc){
2000
            tmp_nuc = tmp_nuc[[1]]
2001
            tmp_nuc$inputs = NULL
2002
            tmp_nuc$original_reads = NULL
2003
            tmp_nuc$wp = nuc$wp
2004
            # print(tmp_nuc)
2005
            return(tmp_nuc)
2006
          })
2007
          return(do.call(rbind, bar))
2008
        })
2009
        tf_nucs = data.frame(do.call(rbind, tf_nucs))
2010
        tmp_x = (unlist(tf_nucs$lower_bound) + unlist(tf_nucs$upper_bound)) / 2
2011
        tmp_y =  y_min + (unlist(tf_nucs$track) - 0.5) * delta_y/nb_rank
2012
        tmp_y_prev = tmp_y[-length(tmp_y)]
2013
        tmp_y_next = tmp_y[-1]
2014
        tmp_y_inter = (tmp_y_prev + tmp_y_next) / 2
2015
        tmp_track = unlist(tf_nucs$track)
2016
        tmp_track_prev = tmp_track[-length(tmp_track)]
2017
        tmp_track_next = tmp_track[-1]
2018
        # tmp_track_inter = signif(tmp_track_prev - tmp_track_next) * (abs(tmp_track_prev - tmp_track_next) > 1) * 25
2019
        if (is.null(config$TRACK_LOD_OFFSET)) {
2020
          config$TRACK_LOD_OFFSET = 0
2021
        }
2022
        tmp_track_inter = signif(tmp_track_prev - tmp_track_next) + config$TRACK_LOD_OFFSET * 25
2023
        tmp_x_prev = tmp_x[-length(tmp_x)]
2024
        tmp_x_next = tmp_x[-1]
2025
        need_shift = apply(t(tmp_x_next - tmp_x_prev), 2, function(delta){ delta < 50})
2026
        tmp_x_inter = (tmp_x_prev + tmp_x_next) / 2 + tmp_track_inter * need_shift
2027
        tmp_llr_inter =signif(unlist(tf_nucs$llr_score)[-1], 2)
2028
        new_tmp_x = c()
2029
        new_tmp_y = c()
2030
        index_odd = 1:length(tmp_x) * 2 - 1
2031
        index_even = (1:(length(tmp_x) - 1)) * 2
2032
        new_tmp_x[index_odd] = tmp_x
2033
        new_tmp_y[index_odd] = tmp_y
2034
        new_tmp_x[index_even] = tmp_x_inter
2035
        new_tmp_y[index_even] = tmp_y_inter
2036
        lines(new_tmp_x , new_tmp_y, lwd=2)
2037
        points(tmp_x, tmp_y, cex=4, pch=16, col="white")
2038
        points(tmp_x, tmp_y, cex=4, lwd=2)
2039
        text(tmp_x, tmp_y, 1:nrow(tf_nucs))
2040
        if (is.null(config$LEGEND_LOD_POS)) {
2041
          pos = 2
2042
        } else {
2043
          pos = config$LEGEND_LOD_POS
2044
        }
2045
        col_llr = sapply(tmp_llr_inter, function(llr){if (llr < 20 ) return("green") else return("red")})
2046
        text(tmp_x_inter, tmp_y_inter, tmp_llr_inter, cex=1.5, pos=pos, col=col_llr) 
2047
        
2048
      }
2049

    
2050
      if (plot_wp_nucs | plot_fuzzy_nucs | plot_common_nucs ) {
2051
    		replicates_wp_nucs[[replicate_rank]] = wp_nucs
2052
        strain = samples[[1]]$strain
2053
        wp_maps[[strain]] = flat_aggregated_intra_strain_nucs(wp_nucs, "foo")
2054
        fuzzy_maps[[strain]] = get_intra_strain_fuzzy(wp_maps[[strain]], as.list(samples[[1]]$roi), samples[[1]]$strain, config=config)
2055

    
2056
        if (plot_fuzzy_nucs) {
2057
          fuzzy_map = fuzzy_maps[[strain]]
2058
          if (!is.null(fuzzy_map)) {
2059
            if (nrow(fuzzy_map) > 0) {
2060
              rect(sign(x_min) * fuzzy_map$lower_bound + shift, y_min, sign(x_min) * fuzzy_map$upper_bound + shift, y_max, col=adjustcolor(3, alpha.f = 0.1), border=1)                    
2061
            }
2062
          }
2063
        } 
2064

    
2065
        if (plot_wp_nucs) {
2066
    			for (wp_nuc in wp_nucs) {
2067
    				if (wp_nuc$wp){
2068
    					rect(sign(x_min) * wp_nuc$lower_bound + shift, y_min, sign(x_min) * wp_nuc$upper_bound + shift, y_max, col=adjustcolor(2, alpha.f = 0.1), border=1)
2069
    					if (plot_wp_nuc_model) {
2070
      					all_original_reads = c()
2071
      					for(initial_nuc in wp_nuc$nucs) {
2072
      						all_original_reads = c(all_original_reads, initial_nuc$original_reads)
2073
      					}
2074
      					delta_x = wp_nuc$lower_bound:wp_nuc$upper_bound
2075
    					  lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(all_original_reads), sd(all_original_reads)) * length(all_original_reads) * height/5 + y_min, col=1)
2076
    				  }
2077
  				  }
2078
  				}
2079
  			}
2080
      }
2081
		}
2082
	}
2083

    
2084
	if (plot_common_nucs) {
2085
    if (is.null(aligned_inter_strain_nucs)) {
2086
      aligned_inter_strain_nucs = align_inter_strain_nucs(replicates, replicates_wp_nucs[[1]], replicates_wp_nucs[[2]], config=config)[[1]]
2087
      if (!is.null(aligned_inter_strain_nucs)) {
2088
        aligned_inter_strain_nucs$cur_index = "foo" 
2089
      }
2090
    }
2091

    
2092
    #Plot common wp nucs
2093
    mid_y = shift = x_min = x_max = nb_rank = base = ylim = ymin = y_max = delta_y = list()
2094
    for (replicate_rank in 1:length(replicates)) {
2095
      nb_rank[[replicate_rank]] = length(samples)
2096
      base[[replicate_rank]] = (replicate_rank-1) * height * nb_rank[[replicate_rank]]
2097
      ylim[[replicate_rank]] = c(base[[replicate_rank]], base[[replicate_rank]] + height * nb_rank[[replicate_rank]])
2098
      y_min[[replicate_rank]] = min(ylim[[replicate_rank]])
2099
      y_max[[replicate_rank]] = max(ylim[[replicate_rank]])
2100
      delta_y[[replicate_rank]] = y_max[[replicate_rank]] - y_min[[replicate_rank]]
2101
      mid_y[[replicate_rank]] = (y_max[[replicate_rank]] + y_min[[replicate_rank]]) / 2
2102

    
2103
      samples = replicates[[replicate_rank]]
2104
      for (sample_rank in 1:length(samples)) {
2105
        sample = samples[[sample_rank]]
2106
        y_lev = y_min[[replicate_rank]] + (sample_rank - 0.5) * delta_y[[replicate_rank]]/nb_rank[[replicate_rank]]
2107
        if (sample$roi[["begin"]] < sample$roi[["end"]]) {
2108
          x_min[[replicate_rank]] = sample$roi[["begin"]]
2109
          x_max[[replicate_rank]] = sample$roi[["end"]]
2110
        } else {
2111
          x_min[[replicate_rank]] = - sample$roi[["begin"]]
2112
          x_max[[replicate_rank]] = - sample$roi[["end"]]
2113
        }
2114
        shift[[replicate_rank]] = x_min[[1]] - x_min[[replicate_rank]]
2115
      }
2116
    }
2117
    print(aligned_inter_strain_nucs)
2118
    if (!is.null(aligned_inter_strain_nucs)) {
2119
      for (inter_strain_nuc_index in 1:length(aligned_inter_strain_nucs[,1])) {
2120
        inter_strain_nuc = aligned_inter_strain_nucs[inter_strain_nuc_index,]
2121
        tmp_xs = tmp_ys = c()
2122
        for (replicate_rank in 1:length(replicates)) {
2123
          samples = replicates[[replicate_rank]]
2124
          strain = samples[[1]]$strain
2125
          tmp_xs = c(tmp_xs, sign(x_min[[replicate_rank]]) * (inter_strain_nuc[[paste("lower_bound_",strain,sep="")]] + inter_strain_nuc[[paste("upper_bound_",strain,sep="")]])/2 + shift[[replicate_rank]])
2126
          tmp_ys = c(tmp_ys, mid_y[[replicate_rank]])
2127
        }
2128
        lines(tmp_xs, tmp_ys, col=2, type="b", lwd=dev.size()[1]*100/(x_max[[1]]-x_min[[1]])*8, cex=dev.size()[1]*400/(x_max[[1]]-x_min[[1]]), pch=19)
2129
      }
2130
    }
2131
    
2132
    if (plot_common_unrs) {
2133
      combi = c(replicates[[1]][[1]]$strain, replicates[[2]][[1]]$strain)
2134
      roi = as.list(samples[[1]]$roi)
2135
      cur_index = "foo"
2136
      common_nuc_results = list()
2137
      common_nuc_results[[paste(combi[1], combi[2], sep="_")]] = aligned_inter_strain_nucs
2138
      unrs = get_unrs(combi, roi, cur_index, wp_maps, fuzzy_maps, common_nuc_results, config = config) 
2139
      rect(sign(x_min[[1]]) * unrs$lower_bound + shift[[1]], y_min[[1]], sign(x_min[[1]]) * unrs$upper_bound + shift[[1]], y_max[[2]], border=4, lwd=10, col=adjustcolor(4, alpha.f = 0.05))        
2140
    }
2141

    
2142
	}
2143
  return(returned_list)
2144
}
2145

    
2146

    
2147

    
2148
get_intra_strain_fuzzy = function(# Compute the fuzzy list for a given strain.
2149
### This function grabs the nucleosomes detxted by template_filter that have been rejected bt aggregate_intra_strain_nucs as well positions.
2150
wp_map, ##<< Well positionned nucleosomes map.
2151
roi, ##<< The region of interest.
2152
strain, ##<< The strain we want to extracvt the fuzzy map.
2153
config=NULL ##<< GLOBAL config variable.
2154
) {
2155
  fuzzy_map = wp_map[wp_map$wp==0, ]
2156
  if (nrow(fuzzy_map) > 0) {
2157
    fuzzy_map = substract_region(fuzzy_map, wp_map[wp_map$wp==1,])
2158
    if (!is.null(fuzzy_map)) {
2159
      fuzzy_map = union_regions(fuzzy_map)
2160
      fuzzy_map = crop_fuzzy(fuzzy_map, roi, strain, config)
2161
    }
2162
  }
2163
  return(fuzzy_map)
2164
}
2165

    
2166

    
2167

    
2168

    
2169

    
2170
get_unrs = function(# Compute the unaligned nucleosomal regions (UNRs).
2171
### This function aggregate non common wp nucs for each strain and substract common wp nucs. It does not take care about the size of the resulting UNR. It will be take into account in the count read part og the pipeline.
2172
combi, ##<< The strain combination to consider.
2173
roi, ##<< The region of interest.
2174
cur_index, ##<< The region of interest index.
2175
wp_maps, ##<< Well positionned nucleosomes maps.
2176
fuzzy_maps, ##<< Fuzzy nucleosomes maps.
2177
common_nuc_results, ##<< Common wp nuc maps
2178
config=NULL ##<< GLOBAL config variable
2179
) {
2180
  # print(cur_index)
2181

    
2182
  tmp_combi_key = paste(combi[1], combi[2], sep="_")
2183
  tmp_common_nucs = common_nuc_results[[tmp_combi_key]]
2184
  tmp_common_nucs = tmp_common_nucs[tmp_common_nucs$cur_index == cur_index, ]
2185

    
2186
  # print(paste("Dealing with unr from", combi[1]))
2187
  tmp_fuzzy = fuzzy_maps[[combi[1]]]
2188
  tmp_fuzzy = tmp_fuzzy[tmp_fuzzy$cur_index == cur_index, ]
2189
  tmp_wp = wp_maps[[combi[1]]]
2190
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2191
  tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ]
2192
  # Let's go!
2193
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2194
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[1], sep="_")]]) {
2195
      return(NULL)
2196
    } else {
2197
      return (index_nuc)
2198
    }
2199
  }))
2200
  tmp_unaligned_wp = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2201
  tmp_unr = rbind(tmp_fuzzy,tmp_unaligned_wp[,1:4])
2202
  if (length(tmp_unr) != 0) {
2203
    tmp_unr = union_regions(tmp_unr)
2204
  }
2205
  tmp_unr_nucs_1 = tmp_unr
2206
  if (length(tmp_unr_nucs_1[,1]) == 0) {return(NULL)}
2207
  agg_unr_1 = tmp_unr_nucs_1
2208

    
2209
  # print(paste("Dealing with unr from ", combi[2]))
2210
  tmp_fuzzy = fuzzy_maps[[combi[2]]]
2211
  tmp_fuzzy = tmp_fuzzy[tmp_fuzzy$cur_index == cur_index, ]
2212
  tmp_wp = wp_maps[[combi[2]]]
2213
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2214
  tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ]
2215
  # Let's go!
2216
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2217
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[2], sep="_")]]) {
2218
      return(NULL)
2219
    } else {
2220
      return (index_nuc)
2221
    }
2222
  }))
2223
  tmp_unaligned_wp = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2224
  tmp_unr = rbind(tmp_fuzzy,tmp_unaligned_wp[,1:4])
2225
  if (length(tmp_unr) != 0) {
2226
    tmp_unr = union_regions(tmp_unr)
2227
  }
2228
  tmp_unr_nucs_2 = tmp_unr
2229
  if (length(tmp_unr_nucs_2[,1]) == 0) {return(NULL)}
2230
  agg_unr_2 = crop_fuzzy(tmp_unr_nucs_2, roi, combi[2], config)
2231
  tr_agg_unr_2 = translate_regions(agg_unr_2, combi, cur_index, roi=roi, config=config)
2232
  tr_agg_unr_2 = union_regions(tr_agg_unr_2)
2233

    
2234
  # print("Dealing with unr from both...")
2235
  all_unr = union_regions(rbind(agg_unr_1, tr_agg_unr_2))
2236

    
2237
  # print(paste("Dealing with wp from", combi[1]))
2238
  tmp_wp = wp_maps[[combi[1]]]
2239
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2240
  tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ]
2241
  # Let's go!
2242
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2243
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[1], sep="_")]]) {
2244
      return (index_nuc)
2245
    } else {
2246
      return(NULL)
2247
    }
2248
  }))
2249
  wp_nucs_1 = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2250
  
2251
  # print(paste("Dealing with wp from", combi[2]))
2252
  tmp_wp = wp_maps[[combi[2]]]
2253
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2254
  tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ]
2255
  # Let's go!
2256
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2257
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[2], sep="_")]]) {
2258
      return (index_nuc)
2259
    } else {
2260
      return(NULL)
2261
    }
2262
  }))
2263
  wp_nucs_2 = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2264
  wp_nucs_2 = crop_fuzzy(wp_nucs_2, roi, combi[2], config)
2265
  if (nrow(wp_nucs_2) == 0) {
2266
    tr_wp_nucs_2 = wp_nucs_2
2267
  } else {
2268
    tr_wp_nucs_2 = translate_regions(wp_nucs_2, combi, cur_index, roi=roi, config=config)      
2269
  }
2270

    
2271
  # print("Dealing with wp from both...")
2272
  all_wp = union_regions(rbind(wp_nucs_1[,1:4], tr_wp_nucs_2))
2273

    
2274
  # print("Dealing with unr and wp...")
2275
  non_inter_unr = substract_region(all_unr, all_wp)
2276
  non_inter_unr = crop_fuzzy(non_inter_unr, roi, combi[1], config)
2277
  if (is.null(non_inter_unr)) { return(NULL) }
2278
  non_inter_unr$len = non_inter_unr$upper_bound - non_inter_unr$lower_bound
2279
  min_unr_width = 75
2280
  non_inter_unr = non_inter_unr[non_inter_unr$len >= min_unr_width,]
2281

    
2282
  non_inter_unr$index_nuc = 1:length(non_inter_unr[,1])
2283
  return (non_inter_unr)
2284
}
2285

    
2286

    
2287