Statistiques
| Branche: | Révision :

root / src / R / nucleominer.R @ 1d833b97

Historique | Voir | Annoter | Télécharger (85,06 ko)

1
# get_content = function(# Get content from cached 
2
# ### Acces to the cached content of a file via the global variable NM_CACHE. Load content in NM_CACHE if needed.
3
# obj, ##<< The object that we want its content.
4
# ... ##<< Parameters that will be passed to my_read
5
# ) {
6
#   UseMethod("get_content", obj)
7
#   ### Returns the cached content of the current object. 
8
# }
9
# 
10
# get_content.default = structure( function(# Get content from cached 
11
# ### Acces to the cached content of a file via the global variable NM_CACHE. Load content in NM_CACHE if needed.
12
# obj, ##<< The object that we want its content.
13
# ... ##<< Parameters that will be passed to my_read
14
# ) {
15
#   if (inherits(try(NM_CACHE,TRUE), "try-error") || is.null(NM_CACHE)) {
16
#     NM_CACHE <<- list()
17
#   }
18
#   if (is.null(NM_CACHE[[obj$filename]])) {
19
#     print(paste("Loading file ",obj$filename, sep=""))
20
#     tmp_content = my_read(obj, ...)
21
#     print("affect it...")
22
#     NM_CACHE[[obj$filename]] <<- tmp_content
23
#     print("done.")
24
#   }  
25
#   return(NM_CACHE[[obj$filename]])
26
# ### Returns the cached content of the current object. 
27
# }, ex=function(){
28
#   # Create a dataframe
29
#   df = NULL
30
#   df = dfadd(df, list(key1 = "value1", key2 = "value2"))
31
#   df = dfadd(df, list(key1 = "value1'", key2 = "value3'"))
32
#   # Dump it into tmp file
33
#   write.table(df, file="/tmp/tmp.dump.table.tmp")
34
#   # Load it into cache using feature.
35
#   cached_content_object  = list(filename="/tmp/tmp.dump.table.tmp")
36
#   class(cached_content_object) = "table"  
37
#   # First time it will be load into cache
38
#   print(get_content(cached_content_object))
39
#   # Second time not
40
#   print(get_content(cached_content_object))
41
#   
42
# })
43
#   
44
# my_read = function(
45
# ### Abstract my_read function.  
46
#   obj, ...) {
47
#   UseMethod("my_read", obj)
48
# }
49
#   
50
# my_read.default = function(
51
# ### Default my_read function.  
52
#   obj, ...){
53
#   stop(paste("ERROR, my_read is not defined for any Objects (file ", obj$filename," )", sep=""))      
54
# }
55
# 
56
# my_read.fasta = function(
57
# ### my_read function for fasta files.  
58
#   obj, ...){
59
#   # require(seqinr)
60
#   return(read.fasta(obj$filename, ...))
61
# }
62
# 
63
# my_read.table = function(
64
# ### my_read function for table files.  
65
#   obj, ...){
66
#   if (rev(unlist(strsplit(obj$filename, ".", fixed=TRUE)))[1] == "gz") {
67
#     return(read.table(file=gzfile(obj$filename), ...))      
68
#   } else { 
69
#     return(read.table(file=obj$filename, ...))
70
#   }
71
# }  
72
# 
73
# my_read.cvs = function(
74
# ### my_read function for cvs files.  
75
#   obj, ...){
76
#   return(read.csv(file=obj$filename, ...))
77
# }  
78

    
79

    
80

    
81

    
82

    
83

    
84

    
85

    
86

    
87
FDR = structure(function#  False Discovery Rate
88
### From a vector x of independent p-values, extract the cutoff corresponding to the specified FDR. See Benjamini & Hochberg 1995 paper
89
##author<< Gael Yvert, 
90
(
91
x, ##<< A vector x of independent p-values.
92
FDR ##<< The specified FDR.
93
) {
94
  x <- sort(na.omit(x))
95
  N = length(x)
96
  i = 1;
97
  while(N*x[i]/i < FDR & i <= N) i = i + 1; # we search for the highest i where Nrandom / Nobserved < FDR
98
  if (i == 1)
99
    return (NA)
100
  else
101
    return( x[i-1] )
102
### Return the the corresponding cutoff.
103
}, ex=function(){
104
  print("example")
105
})
106

    
107

    
108

    
109
lod_score_vecs = structure(function # Likelihood ratio
110
### Compute the likelihood log of two set of value from two models Vs. a unique model.
111
(
112
x ,##<< First vector.
113
y ##<< Second vector.
114
) {
115
	if (length(x) <=1 | length(y) <= 1) {
116
		return(NA)
117
	}
118
  meanX = mean(x)
119
  sdX = sd(x)
120
  meanY = mean(y)
121
  sdY = sd(y)
122
  meanXY = mean(c(x,y))
123
  sdXY = sd(c(x,y))
124
  llX = sum(log(dnorm(x,mean=meanX,sd=sdX)))
125
  llY = sum(log(dnorm(y,mean=meanY,sd=sdY)))
126
  llXY = sum(log(dnorm(c(x,y),mean=meanXY,sd=sdXY)))
127
  ratio = -(llX + llY - llXY)
128
  return(ratio)
129
### Returns the likelihood ratio.
130
}, ex=function(){
131
  # LOD score for 2 set of values
132
  mean1=5; sd1=2; card2 = 250
133
  mean2=6; sd2=3; card1 = 200
134
  x1 = rnorm(card1, mean1, sd1)
135
  x2 = rnorm(card2, mean2, sd2)  
136
  min = floor(min(c(x1,x2)))
137
  max = ceiling(max(c(x1,x2)))
138
  hist(c(x1,x2), xlim=c(min, max), breaks=min:max)
139
  lines(min:max,dnorm(min:max,mean1,sd1)*card1,col=2)
140
  lines(min:max,dnorm(min:max,mean2,sd2)*card2,col=3)
141
  lines(min:max,dnorm(min:max,mean(c(x1,x2)),sd(c(x1,x2)))*card2,col=4)
142
  lod_score_vecs(x1,x2)
143
 })
144

    
145
dfadd = structure(function# Adding list to a dataframe.
146
### Add a list \emph{l} to a dataframe \emph{df}. Create it if \emph{df} is \emph{NULL}. Return the dataframe \emph{df}.
147
	(df, ##<<  A dataframe
148
		l ##<<  A list
149
	) {
150
  if (is.null(df)) {
151
    df = data.frame(l,stringsAsFactors=FALSE)
152
  } else {
153
    df = rbind(df, data.frame(l,stringsAsFactors=FALSE))
154
  }
155
  return(df)
156
### Return the dataframe \emph{df}.
157
}, ex=function(){
158
		## Here dataframe is NULL
159
		print(df)
160
		df = NULL
161
		
162
		# Initialize df
163
		df = dfadd(df, list(key1 = "value1", key2 = "value2"))
164
		print(df)
165
		
166
		# Adding elements to df
167
		df = dfadd(df, list(key1 = "value1'", key2 = "value2'"))
168
		print(df)
169
})
170

    
171

    
172
sign_from_strand = function(
173
### Get the sign of strand 
174
strands) {
175
	apply(t(strands), 2, function(strand) {	if (strand == "F") return(1) else return(-1)})
176
### If strand in forward then returns 1 else returns -1
177
}
178

    
179
flat_reads = function(
180
### Extract reads coordinates from TempleteFilter input sequence
181
reads, ##<< TemplateFilter input reads 
182
nuc_width ##<< Width used to shift F and R reads.
183
) {
184
	F_flatted_reads = unlist(apply(t(reads[reads$V3=="F",]),2,function(r){rep(as.integer(r[2]), r[4])}))
185
	R_flatted_reads = unlist(apply(t(reads[reads$V3=="R",]),2,function(r){rep(as.integer(r[2]), r[4])}))
186
	flatted_reads = c(F_flatted_reads + rep(nuc_width/2, length(F_flatted_reads)), R_flatted_reads - rep(nuc_width/2, length(R_flatted_reads))  )
187
	return(list(F_flatted_reads, R_flatted_reads, flatted_reads))
188
### Returns a list of F reads, R reads and joint/shifted F and R reads.
189
}
190

    
191

    
192

    
193
filter_tf_outputs = function(# Filter TemplateFilter outputs
194
### This function filters TemplateFilter outputs according, not only genome area observerved properties, but also correlation and overlap threshold.
195
tf_outputs, ##<< TemplateFilter outputs.
196
chr, ##<< Chromosome observed, here chr is an integer.
197
x_min, ##<< Coordinate of the first bp observed.
198
x_max, ##<< Coordinate of the last bp observed.
199
nuc_width = 160, ##<< Nucleosome width.
200
ol_bp = 59, ##<< Overlap Threshold.
201
corr_thres = 0.5 ##<< Correlation threshold.
202
) {
203
  if (x_min < 0) {
204
    tf_outputs = tf_outputs[tf_outputs$chr == paste("chr", chr, sep="") & tf_outputs$center > (-x_max - nuc_width/2) & tf_outputs$center <  (-x_min + nuc_width/2),]
205
	} else {
206
    tf_outputs = tf_outputs[tf_outputs$chr == paste("chr", chr, sep="") & tf_outputs$center > (x_min - nuc_width/2) & tf_outputs$center < (x_max + nuc_width/2),]
207
  }
208
  tf_outputs$lower_bound = tf_outputs$center - tf_outputs$width/2
209
  tf_outputs$upper_bound = tf_outputs$center + tf_outputs$width/2
210
  tf_outputs = tf_outputs[tf_outputs$correlation >= corr_thres,]  
211
  tf_outputs = tf_outputs[order(tf_outputs$correlation,decreasing=TRUE),]  
212
  i = 1
213
  while (i <= length(tf_outputs[,1])) {    
214
    lb = tf_outputs[i,]$low 
215
    ub = tf_outputs[i,]$up
216
    tf_outputs = tf_outputs[!(tf_outputs$low <= (ub-ol_bp) & tf_outputs$up > ub) & !(tf_outputs$up >= (lb+ol_bp) & tf_outputs$low < lb),]
217
    i = i+1
218
  }
219
  return(tf_outputs)
220
### Returns filtered TemplateFilter Outputs
221
}
222

    
223

    
224

    
225
filter_tf_inputs = function(# Filter TemplateFilter inputs
226
### This function filters TemplateFilter inputs according genome area observed properties. It takes into account reads that are at the frontier of this area and the strand of these reads.
227
inputs, ##<< TF inputs to be filtered.
228
chr, ##<< Chromosome observed, here chr is an integer.
229
x_min, ##<< Coordinate of the first bp observed.
230
x_max, ##<< Coordinate of the last bp observed.
231
nuc_width = 160, ##<< Nucleosome width.
232
only_f = FALSE, ##<< Filter only F reads.
233
only_r = FALSE ##<< Filter only R reads.
234
) {
235
	if (only_f) {
236
		inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "F" & inputs[,2] <= x_max + nuc_width,]
237
	} else if (only_r) {
238
		inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "R" & inputs[,2] <= x_max + nuc_width,]			
239
	} else {
240
		inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,2] <= x_max + nuc_width,]
241
	}
242
	corrected_inputs_coords = inputs[,2] + nuc_width/2 * sign_from_strand(inputs[,3])
243
  inputs = inputs[inputs[,1]==chr & corrected_inputs_coords >= x_min & corrected_inputs_coords <= x_max,]
244
	return(inputs)
245
### Returns filtred inputs.
246
}
247

    
248

    
249

    
250
get_comp_strand = function(
251
### Compute the complementatry strand.
252
strand ##<< The original strand.
253
) {
254
	apply(t(strand),2, function(n){
255
	  if (n=="a") {return("t")} 
256
		if (n=="t") {return("a")}
257
		if (n=="c") {return("g")} 
258
		if (n=="g") {return("c")} 
259
	})
260
### Returns the complementatry strand.
261
}
262

    
263

    
264
aggregate_intra_strain_nucs = structure(function(# Aggregate replicated sample's nucleosomes. 
265
### This function aggregates nucleosome for replicated samples. It uses TemplateFilter ouput of each sample as replicate. Each sample owns a set of nucleosomes computed using TemplateFilter and ordered by the position of their center. Adajacent nucleosomes are compared two by two. Comparison is based on a log likelihood ratio score. The issue of comparison is adjacents nucleosomes merge or separation. Finally the function returns a list of clusters and all computed \emph{lod_scores}. Each cluster ows an attribute \emph{wp} for "well positionned". This attribute is set as \emph{TRUE} if the cluster is composed of exactly one nucleosomes of each sample.
266
samples, ##<< A list of samples. Each sample is a list like \emph{sample = list(id=..., marker=..., strain=..., roi=..., inputs=..., outputs=...)} with \emph{roi = list(name=..., begin=...,  end=..., chr=..., genome=...)}.
267
lod_thres=-20, ##<< Log likelihood ration threshold.
268
coord_max=20000000 ##<< A too big value to be a coord for a nucleosome lower bound.
269
){
270
	end_of_tracks = function(tracks) {
271
		if (length(tracks) == 0) {
272
			return(TRUE)
273
		}
274
	  for (lower_bound in tracks) {
275
			if(!is.na(lower_bound)) {
276
	      if (lower_bound < coord_max) {
277
	        return(FALSE)
278
	      }
279
	  	}
280
	  }
281
	  return(TRUE)
282
	}
283
	store_cluster = function(clusters, new_cluster, nb_nucs_in_cluster,nuc_from_track,nb_tracks, min_nuc_center, max_nuc_center) {
284
		if ( nb_nucs_in_cluster==nb_tracks & sum(nuc_from_track)==nb_tracks) {
285
			new_cluster$wp = TRUE
286
			center = (new_cluster$lower_bound + new_cluster$upper_bound) / 2
287
			if (is.null(min_nuc_center) | ((min_nuc_center <= center) & (center < max_nuc_center))) {
288
		  	clusters[[length(clusters) + 1]] = new_cluster
289
				# print(new_cluster)
290
		  }
291
		} else {
292
			new_cluster$wp = FALSE
293
			center = (new_cluster$lower_bound + new_cluster$upper_bound) / 2
294
			if (is.null(min_nuc_center) | ((min_nuc_center <= center) & (center < max_nuc_center))) {
295
			  clusters[[length(clusters) + 1]] = new_cluster
296
			}
297
		}
298
		return(clusters)
299
	}
300
	strain = samples[[1]]$strain
301
	lod_scores = c()
302
  min_nuc_center = min(samples[[1]]$roi$begin, samples[[1]]$roi$end)
303
	max_nuc_center = max(samples[[1]]$roi$begin, samples[[1]]$roi$end)
304
  # compute clusters
305
  clusters = list()
306
  cluster_contents = list()
307
  # Init reader
308
  indexes = c()
309
  track_readers = c()
310
  current_nuc = NULL
311
	lod_score = lod_thres - 1 
312
  # Read nucs from TF outputs  
313
  tf_outs = list()
314
	i = 1
315
  for (sample in samples) {
316
		# print(sample$roi$chr)
317
		# print(min_nuc_center)
318
		# print(max_nuc_center)
319
		# print(sample$outputs)
320
		# tf_outs[[i]] = filter_tf_outputs(sample$outputs, sample$roi$chr, min_nuc_center, max_nuc_center)
321
		# print(tf_outs[[i]])
322
		tf_outs[[i]] = sample$outputs
323
		tf_outs[[i]] = tf_outs[[i]][order(tf_outs[[i]]$center),]  
324
    indexes[i] = 1
325
		if (is.na(tf_outs[[i]][indexes[i],]$center)) {
326
      track_readers[i] = coord_max				
327
	  } else {
328
      track_readers[i] = tf_outs[[i]][indexes[i],]$center				
329
		}
330
		i = i+1		
331
  }
332
	# print(track_readers)
333
  new_cluster = NULL
334
  nb_nucs_in_cluster = 0
335
  nuc_from_track = c()
336
  for (i in 1:length(tf_outs)){
337
    nuc_from_track[i] = FALSE
338
  }
339
  # Start clustering
340
  while (!end_of_tracks(track_readers)) {
341
    new_center = min(track_readers)
342
		current_track = which(track_readers == new_center)[1]			
343
    new_nuc = as.list(tf_outs[[current_track]][indexes[current_track],])
344
		new_nuc$chr = substr(new_nuc$chr,4,1000000L)
345
		new_nuc$inputs = samples[[current_track]]$inputs
346
		new_nuc$chr = samples[[current_track]]$roi$chr
347
		new_nuc$track = current_track
348
		
349
		new_nuc$inputs = filter_tf_inputs(samples[[current_track]]$inputs, new_nuc$chr, new_nuc$lower_bound, new_nuc$upper_bound, new_nuc$width) 
350
		flatted_reads = flat_reads(new_nuc$inputs, new_nuc$width)
351
		new_nuc$original_reads = flatted_reads[[3]]
352

    
353
    new_upper_bound = new_nuc$upper_bound
354

    
355
    if (!is.null(current_nuc)) {
356
			lod_score = lod_score_vecs(current_nuc$original_reads,new_nuc$original_reads)
357
			lod_scores = c(lod_scores,lod_score)
358
		}
359
		# print(paste(lod_score, length(current_nuc$original_reads), length(new_nuc$original_reads), sep=" "))
360
		if (is.na(lod_score)) {
361
			lod_score = lod_thres - 1 
362
		}
363
	  if (lod_score > lod_thres) {
364
			# Store lod_score
365
			new_nuc$lod_score = lod_score 			
366
      # aggregate to current cluster
367
      #   update bound
368
      if (new_nuc$upper_bound > new_cluster$upper_bound) {
369
        new_cluster$upper_bound = new_nuc$upper_bound      
370
      }
371
      if (new_nuc$lower_bound < new_cluster$lower_bound) {
372
        new_cluster$lower_bound = new_nuc$lower_bound         
373
      }
374
      #   add nucleosome to current cluster
375
      nuc_from_track[current_track] = TRUE
376
      nb_nucs_in_cluster = nb_nucs_in_cluster + 1 
377
			new_cluster$nucs[[length(new_cluster$nucs)+1]] = new_nuc
378
    } else {
379
			if (!is.null(new_cluster)) {
380
        # store old cluster
381
	      clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster,nuc_from_track,length(tf_outs),min_nuc_center, max_nuc_center)
382
			}
383
      # Reinit current cluster composition stuff
384
      nb_nucs_in_cluster = 0
385
      nuc_from_track = c()
386
      for (i in 1:length(tf_outs)){
387
        nuc_from_track[i] = FALSE
388
      }
389
      # create new cluster        
390
      new_cluster = list(lower_bound=new_nuc$low, upper_bound=new_nuc$up, chr=new_nuc$chr, strain_ref=strain , nucs=list())
391
      # update upper bound
392
      current_upper_bound = new_upper_bound
393
      # add nucleosome to current cluster
394
      nb_nucs_in_cluster = nb_nucs_in_cluster + 1 
395
      nuc_from_track[current_track] = TRUE
396
			new_cluster$nucs[[length(new_cluster$nucs)+1]] = new_nuc
397
				
398
		}
399
			
400
		current_nuc = new_nuc
401

    
402
    # update indexes
403
    if (indexes[current_track] < length(tf_outs[[current_track]]$center)) {      
404
      indexes[current_track] = indexes[current_track] + 1
405
      # update track
406
      track_readers[current_track] = tf_outs[[current_track]][indexes[current_track],]$center
407
    } else {
408
      # update track
409
      track_readers[current_track] = coord_max
410
    }
411
  }
412
  # store last cluster
413
  if (!is.null(new_cluster)) {
414
    # store old cluster
415
    clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster,nuc_from_track,length(tf_outs),min_nuc_center, max_nuc_center)
416
  }
417
	return(list(clusters, lod_scores))
418
### Returns a list of clusterized nucleosomes, and all computed lod scores.
419
}, ex=function(){	
420
	# Dealing with a region of interest
421
	roi =list(name="example", begin=1000,  end=1300, chr="1", genome=rep("A",301))
422
	samples = list()
423
	for (i in 1:3) {
424
		# Create TF output
425
		tf_nuc = list("chr"=paste("chr", roi$chr, sep=""), "center"=(roi$end + roi$begin)/2, "width"= 150, "correlation.score"= 0.9)
426
		outputs = dfadd(NULL,tf_nuc)
427
		outputs = filter_tf_outputs(outputs, roi$chr, roi$begin, roi$end)
428
		# Generate corresponding reads
429
		nb_reads = round(runif(1,170,230))
430
		reads = round(rnorm(nb_reads, tf_nuc$center,20))
431
		u_reads = sort(unique(reads))
432
		strands = sample(c(rep("R",ceiling(length(u_reads)/2)),rep("F",floor(length(u_reads)/2))))
433
		counts = apply(t(u_reads), 2, function(r) { sum(reads == r)})
434
		shifts = apply(t(strands), 2, function(s) { if (s == "F") return(-tf_nuc$width/2) else return(tf_nuc$width/2)})
435
		u_reads = u_reads + shifts
436
		inputs = data.frame(list("V1" = rep(roi$chr, length(u_reads)), 
437
		                         "V2" = u_reads, 
438
														 "V3" = strands, 
439
														 "V4" = counts), stringsAsFactors=FALSE)
440
		samples[[length(samples) + 1]] = list(id=1, marker="Mnase_Seq", strain="strain_ex", total_reads = 10000000, roi=roi, inputs=inputs, outputs=outputs)
441
	}
442
	print(aggregate_intra_strain_nucs(samples))
443
})
444

    
445

    
446

    
447

    
448

    
449

    
450
align_inter_strain_nucs = structure(function(# Aligns nucleosomes between 2 strains.
451
### This function aligns nucs between two strains for a given genome region.
452
replicates, ##<< Set of replicates, ideally 3 per strain. 
453
wp_nucs_strain_ref1=NULL, ##<< List of aggregates nucleosome for strain 1. If it's null this list will be computed.
454
wp_nucs_strain_ref2=NULL, ##<< List of aggregates nucleosome for strain 2. If it's null this list will be computed.
455
corr_thres=0.5, ##<< Correlation threshold.
456
lod_thres=-100, ##<< LOD cut off.
457
config=NULL, ##<< GLOBAL config variable
458
... ##<< A list of parameters that will be passed to \emph{aggregate_intra_strain_nucs} if needed.	
459
) {
460
	if (length(replicates) < 2) {
461
		stop("ERROR, align_inter_strain_nucs needs 2 replicate sets.")
462
	} else if (length(replicates) > 2) {
463
		print("WARNING, align_inter_strain_nucs will use 2 first sets of replicates as inputs.")			
464
	}
465
	common_nuc = NULL
466
	lod_scores = c()
467
	chr = replicates[[1]][[1]]$roi$chr
468
  min_nuc_center = min(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end)
469
	max_nuc_center = max(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end)
470
	strain_ref1 = replicates[[1]][[1]]$strain
471
	strain_ref2 = replicates[[2]][[1]]$strain
472
	big_roi = replicates[[1]][[1]]$roi
473
	if(big_roi$end - big_roi$begin < 0) {
474
		tmp_begin = big_roi$begin
475
		big_roi$begin =  big_roi$end
476
		big_roi$end =  tmp_begin
477
	}
478
	# print(big_roi)
479
	# GO!
480
	if (is.null(wp_nucs_strain_ref1)) {
481
		wp_nucs_strain_ref1 = aggregate_intra_strain_nucs(replicates[[1]], ...)[[1]]
482
	}
483
	if (is.null(wp_nucs_strain_ref2)) {
484
	  wp_nucs_strain_ref2 = aggregate_intra_strain_nucs(replicates[[2]], ...)[[1]]
485
  }
486
  # foo <<- wp_nucs_strain_ref1
487
  # print(apply(t(wp_nucs_strain_ref1), 2, function(l){c(l[[1]]$lower_bound, l[[1]]$upper_bound, l[[1]]$wp)}))
488
  # print(apply(t(wp_nucs_strain_ref2), 2, function(l){c(l[[1]]$lower_bound, l[[1]]$upper_bound, l[[1]]$wp)}))
489
	# dealing with matching_nas
490
	lws = c()
491
	ups = c()
492
	for (na in wp_nucs_strain_ref2) {
493
		lws = c(lws, na$lower_bound)
494
		ups = c(ups, na$upper_bound)
495
	}
496

    
497
	print(paste("Exploring chr" , chr , ", " , length(wp_nucs_strain_ref1) , ", [" , min_nuc_center , ", " , max_nuc_center , "] nucs...", sep=""))			
498
	roi_strain_ref1 = NULL
499
	roi_strain_ref2 = NULL
500
	if (length(wp_nucs_strain_ref1) > 0) {
501
		for(index_nuc_strain_ref1 in 1:length(wp_nucs_strain_ref1)){
502
			# print(paste("" , index_nuc_strain_ref1 , "/" , length(wp_nucs_strain_ref1), sep=""))
503
			nuc_strain_ref1 = wp_nucs_strain_ref1[[index_nuc_strain_ref1]]
504
			# Filtering on Well Positionned
505
			if (nuc_strain_ref1$wp) {			
506
				roi_strain_ref1 = list(name=paste("strain_chr_id_" , strain_ref1 , "_" , chr , "_" , "i" , "_", sep=""), begin=nuc_strain_ref1$lower_bound, end=nuc_strain_ref1$upper_bound, chr=chr, strain_ref = strain_ref1)
507
				roi_strain_ref2 = translate_roi(roi_strain_ref1, strain_ref2, big_roi, config) 
508
        if (!is.null(roi_strain_ref2)){
509
					# LOADING INTRA_STRAIN_NUCS_FILENAME_STRAIN_REF2 FILE(S) TO COMPUTE MATCHING_NAS (FILTER)
510
					lower_bound_roi_strain_ref2 = min(roi_strain_ref2$end,roi_strain_ref2$begin)
511
					upper_bound_roi_strain_ref2 = max(roi_strain_ref2$end,roi_strain_ref2$begin)
512
					matching_nas = which( lower_bound_roi_strain_ref2 <= ups & lws <= upper_bound_roi_strain_ref2)
513
					for (index_nuc_strain_ref2 in matching_nas) {
514
						nuc_strain_ref2 = wp_nucs_strain_ref2[[index_nuc_strain_ref2]]
515
						# Filtering on Well Positionned
516
						if (nuc_strain_ref2$wp) {
517
							# Filtering on correlation Score and collecting reads					
518
							SKIP = FALSE
519
							# TODO: This for loop could be done before working on strain_ref2. Isn't it?
520
							reads_strain_ref1 = c()
521
							for (nuc in nuc_strain_ref1$nucs){
522
								reads_strain_ref1 = c(reads_strain_ref1, nuc$original_reads)							
523
								if (nuc$corr < corr_thres) {
524
									SKIP = TRUE
525
								}
526
							}
527
							reads_strain_ref2 = c()
528
							for (nuc in nuc_strain_ref2$nucs){
529
								reads_strain_ref2 = c(reads_strain_ref2, nuc$original_reads)							
530
								if (nuc$corr < corr_thres) {
531
									SKIP = TRUE
532
								}
533
							}
534
							# Filtering on correlation Score						
535
							if (!SKIP) {
536
								# tranlation of reads into strain 2 coords
537
								diff = ((roi_strain_ref1$begin + roi_strain_ref1$end) - (roi_strain_ref2$begin + roi_strain_ref2$end)) / 2
538
								reads_strain_ref1 = reads_strain_ref1 - rep(diff, length(reads_strain_ref1))
539
								lod_score = lod_score_vecs(reads_strain_ref1, reads_strain_ref2)
540
								lod_scores = c(lod_scores, lod_score)
541
								# Filtering on LOD Score						
542
								if (lod_score > lod_thres) {
543
									tmp_nuc = list()
544
									# strain_ref1
545
									tmp_nuc[[paste("chr_", strain_ref1, sep="")]] = chr
546
									tmp_nuc[[paste("lower_bound_", strain_ref1, sep="")]] = nuc_strain_ref1$lower_bound
547
									tmp_nuc[[paste("upper_bound_", strain_ref1, sep="")]] = nuc_strain_ref1$upper_bound
548
									tmp_nuc[[paste("mean_", strain_ref1, sep="")]] = signif(mean(reads_strain_ref1),5)
549
									tmp_nuc[[paste("sd_", strain_ref1, sep="")]] = signif(sd(reads_strain_ref1),5)
550
									tmp_nuc[[paste("nb_reads_", strain_ref1, sep="")]] = length(reads_strain_ref1)
551
									tmp_nuc[[paste("index_nuc_", strain_ref1, sep="")]] = index_nuc_strain_ref1
552
									# tmp_nuc[[paste("corr1_", strain_ref1, sep="")]] = signif(nuc_strain_ref1$nucs[[1]]$corr,5)
553
									# tmp_nuc[[paste("corr2_", strain_ref1, sep="")]] = signif(nuc_strain_ref1$nucs[[2]]$corr,5)
554
									# tmp_nuc[[paste("corr3_", strain_ref1, sep="")]] = signif(nuc_strain_ref1$nucs[[3]]$corr,5)
555
									# strain_ref2
556
									tmp_nuc[[paste("chr_", strain_ref2, sep="")]] = roi_strain_ref2$chr
557
									tmp_nuc[[paste("lower_bound_", strain_ref2, sep="")]] = nuc_strain_ref2$lower_bound
558
									tmp_nuc[[paste("upper_bound_", strain_ref2, sep="")]] = nuc_strain_ref2$upper_bound
559
									tmp_nuc[[paste("means_", strain_ref2, sep="")]] = signif(mean(reads_strain_ref2),5)
560
									tmp_nuc[[paste("sd_", strain_ref2, sep="")]] = signif(sd(reads_strain_ref2),5)
561
									tmp_nuc[[paste("nb_reads_", strain_ref2, sep="")]] = length(reads_strain_ref2)
562
									tmp_nuc[[paste("index_nuc_", strain_ref2, sep="")]] = index_nuc_strain_ref2
563
									# tmp_nuc[[paste("corr1_", strain_ref2, sep="")]] = signif(nuc_strain_ref2$nucs[[1]]$corr,5)
564
									# tmp_nuc[[paste("corr2_", strain_ref2, sep="")]] = signif(nuc_strain_ref2$nucs[[2]]$corr,5)
565
									# tmp_nuc[[paste("corr3_", strain_ref2, sep="")]] = signif(nuc_strain_ref2$nucs[[3]]$corr,5)
566
									# common
567
									tmp_nuc[["lod_score"]] = signif(lod_score,5)
568
									# print(tmp_nuc)
569
									common_nuc = dfadd(common_nuc, tmp_nuc)
570
								}
571
							}
572
						}
573
					}				
574
				} else {
575
		      print("WARNING! No roi for strain ref 2.")
576
			  }
577
		  }
578
		}
579
		
580
		if(length(unique(common_nuc[,1:3])[,1]) != length((common_nuc[,1:3])[,1])) {
581
			index_redundant = which(apply(common_nuc[,1:3][-length(common_nuc[,1]),] ==  common_nuc[,1:3][-1,] ,1,sum) == 3)
582
			to_remove_list = c()			
583
			for (i in 1:length(index_redundant)) {	
584
				if (common_nuc[index_redundant[i],15] < common_nuc[index_redundant[i]+1,15]) {
585
				  to_remove = index_redundant[i]
586
				}	 else {
587
					to_remove = index_redundant[i] + 1
588
			  } 		
589
				to_remove_list = c(to_remove_list, to_remove)
590
			}
591
			common_nuc = common_nuc[-to_remove_list,]
592
		}
593
		
594
		if(length(unique(common_nuc[,8:10])[,1]) != length((common_nuc[,8:10])[,1])) {
595
			index_redundant = which(apply(common_nuc[,8:10][-length(common_nuc[,1]),] ==  common_nuc[,8:10][-1,] ,1,sum) == 3)
596
			to_remove_list = c()			
597
			for (i in 1:length(index_redundant)) {	
598
				if (common_nuc[index_redundant[i],15] < common_nuc[index_redundant[i]+1,15]) {
599
				  to_remove = index_redundant[i]
600
				}	 else {
601
					to_remove = index_redundant[i] + 1
602
			  } 		
603
				to_remove_list = c(to_remove_list, to_remove)
604
			}
605
			common_nuc = common_nuc[-to_remove_list,]
606
		}
607
				
608
		return(list(common_nuc, lod_scores))
609
	} else {
610
		print("WARNING, no nucs for strain_ref1.")
611
		return(NULL)
612
	}
613
### Returns a list of clusterized nucleosomes, and all computed lod scores.
614
}, ex=function(){	
615

    
616
    # Define new translate_roi function...
617
    translate_roi = function(roi, strain2, big_roi=NULL, config=NULL) {
618
      return(roi)
619
    }
620
    # Binding it by uncomment follwing lines.
621
    unlockBinding("translate_roi", as.environment("package:nucleominer"))
622
    unlockBinding("translate_roi", getNamespace("nucleominer"))
623
    assign("translate_roi", translate_roi, "package:nucleominer")
624
    assign("translate_roi", translate_roi, getNamespace("nucleominer"))
625
    lockBinding("translate_roi", getNamespace("nucleominer"))
626
    lockBinding("translate_roi", as.environment("package:nucleominer"))  
627

    
628
	# Dealing with a region of interest
629
	roi =list(name="example", begin=1000,  end=1300, chr="1", genome=rep("A",301), strain_ref1 = "STRAINREF1")
630
	roi2 = translate_roi(roi, roi$strain_ref1)
631
	replicates = list()
632
	for (j in 1:2) {
633
		samples = list()
634
		for (i in 1:3) {
635
			# Create TF output
636
			tf_nuc = list("chr"=paste("chr", roi$chr, sep=""), "center"=(roi$end + roi$begin)/2, "width"= 150, "correlation.score"= 0.9)
637
			outputs = dfadd(NULL,tf_nuc)
638
			outputs = filter_tf_outputs(outputs, roi$chr, roi$begin, roi$end)
639
			# Generate corresponding reads
640
			nb_reads = round(runif(1,170,230))
641
			reads = round(rnorm(nb_reads, tf_nuc$center,20))
642
			u_reads = sort(unique(reads))
643
			strands = sample(c(rep("R",ceiling(length(u_reads)/2)),rep("F",floor(length(u_reads)/2))))
644
			counts = apply(t(u_reads), 2, function(r) { sum(reads == r)})
645
			shifts = apply(t(strands), 2, function(s) { if (s == "F") return(-tf_nuc$width/2) else return(tf_nuc$width/2)})
646
			u_reads = u_reads + shifts
647
			inputs = data.frame(list("V1" = rep(roi$chr, length(u_reads)), 
648
			                         "V2" = u_reads, 
649
															 "V3" = strands, 
650
															 "V4" = counts), stringsAsFactors=FALSE)
651
			samples[[length(samples) + 1]] = list(id=1, marker="Mnase_Seq", strain=paste("strain_ex",j,sep=""), total_reads = 10000000, roi=roi, inputs=inputs, outputs=outputs)
652
		}
653
		replicates[[length(replicates) + 1]] = samples
654
	}
655
	print(align_inter_strain_nucs(replicates))
656
})
657

    
658

    
659

    
660

    
661

    
662

    
663

    
664

    
665

    
666

    
667

    
668

    
669

    
670

    
671

    
672

    
673

    
674

    
675

    
676

    
677

    
678

    
679

    
680

    
681

    
682

    
683
fetch_mnase_replicates = function(# Prefetch data
684
### Fetch and filter inputs and outpouts per region of interest. Organize it per replicates.
685
strain, ##<< The strain we want mnase replicatesList of replicates. Each replicates is a vector of sample ids.
686
roi, ##<< Region of interest.
687
all_samples, ##<< Global list of samples.
688
config=NULL, ##<< GLOBAL config variable
689
only_fetch=FALSE, ##<< If TRUE, only fetch and not filtering. It is used tio load sample files into memory before forking.
690
get_genome=FALSE, ##<< If TRUE, load corresponding genome sequence.
691
get_ouputs=TRUE##<< If TRUE, get also ouput corresponding TF output files.
692
) {
693
	samples=list()
694
  samples_ids = unique(all_samples[all_samples$marker == "Mnase_Seq" & all_samples$strain == strain,]$id)
695
	for (i in samples_ids) {
696
		sample = as.list(all_samples[all_samples$id==i,])
697
    sample$roi = translate_roi(roi, sample$strain, config)
698
		if (get_genome) {
699
			# Get Genome
700
      sample$roi$genome = get_content(config$FASTA_REFERENCE_GENOME_FILES[[sample$strain]], "fasta")[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end]		
701
		}
702
		# Get inputs
703
		sample$inputs = get_content(paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep=""), "table", stringsAsFactors=FALSE) 
704
		sample$total_reads = sum(sample$inputs[,4]) 	
705
		if (!only_fetch) {
706
		  sample$inputs = filter_tf_inputs(sample$inputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
707
	  }
708
	  # Get TF outputs for Mnase_Seq samples
709
		if (sample$marker == "Mnase_Seq" & get_ouputs) {	
710
			sample$outputs = get_content(paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep=""), "table", header=TRUE, sep="\t")
711
			if (!only_fetch) {
712
	  		sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr,  min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
713
  		}
714
		}
715
		samples[[length(samples) + 1]] = sample		
716
	}
717
  return(samples)
718
}
719

    
720
substract_region = function(# Substract to a list of regions an other list of regions that intersect it.
721
### This fucntion embed a recursive part. It occurs when a substracted region split an original region on two.
722
region1, ##<< Original regions. 
723
region2 ##<< Regions to substract.
724
) {
725
  rec_substract_region = function(region1, region2) {
726
  non_inter_fuzzy = apply(t(1:length(region1[,1])), 2, function(i) {
727
    cur_fuzzy = region1[i,]
728
    inter_wp = region2[region2$lower_bound <= cur_fuzzy$upper_bound & region2$upper_bound >= cur_fuzzy$lower_bound,]
729
    if (length(inter_wp[,1]) > 0) {
730
      ret = c()
731
      for (j in 1:length(inter_wp[,1])) {
732
        cur_wp = inter_wp[j,]
733
        if (cur_wp$lower_bound <= cur_fuzzy$lower_bound & cur_fuzzy$upper_bound <= cur_wp$upper_bound) {
734
          # remove cur_fuzzy
735
          ret = c()
736
          break
737
        } else if (cur_wp$lower_bound <= cur_fuzzy$lower_bound & cur_wp$upper_bound < cur_fuzzy$upper_bound) {
738
          # crop fuzzy 
739
          cur_fuzzy$lower_bound = cur_wp$upper_bound + 1 
740
          ret = cur_fuzzy
741
        } else if (cur_fuzzy$lower_bound < cur_wp$lower_bound & cur_fuzzy$upper_bound <= cur_wp$upper_bound) {
742
          # crop fuzzy 
743
          cur_fuzzy$upper_bound = cur_wp$lower_bound - 1 
744
          ret = cur_fuzzy
745
        } else if (cur_wp$lower_bound > cur_fuzzy$lower_bound & cur_wp$upper_bound < cur_fuzzy$upper_bound) {
746
          # split fuzzy
747
          tmp_ret_fuzzy_1 = cur_fuzzy
748
          tmp_ret_fuzzy_1$upper_bound = cur_wp$lower_bound - 1 
749
          tmp_ret_fuzzy_2 = cur_fuzzy
750
          tmp_ret_fuzzy_2$lower_bound = cur_wp$upper_bound + 1 
751
          ret = rec_substract_region(rbind(tmp_ret_fuzzy_1, tmp_ret_fuzzy_2), inter_wp)
752
          # print(ret)
753
          # ret = cur_fuzzy
754
          break
755
        } else {
756
          stop("WARNING NO ADAPTED CASE!")
757
        }          
758
      }
759
      return(ret)
760
    } else {
761
      return(cur_fuzzy)
762
    }
763
  })
764
  }
765
  non_inter_fuzzy = rec_substract_region(region1, region2)
766
  if (is.null(non_inter_fuzzy)) {return(non_inter_fuzzy)}
767
  tmp_ulist = unlist(non_inter_fuzzy)
768
  tmp_names = names(tmp_ulist)[1:4]
769
  non_inter_fuzzy = data.frame(matrix(tmp_ulist,ncol=4, byrow=TRUE), stringsAsFactors=FALSE) 
770
  names(non_inter_fuzzy) = tmp_names
771
  non_inter_fuzzy$lower_bound = as.numeric(non_inter_fuzzy$lower_bound)
772
  non_inter_fuzzy$upper_bound = as.numeric(non_inter_fuzzy$upper_bound)
773
  return(non_inter_fuzzy)
774
}
775

    
776
union_regions = function(# Aggregate regions that intersect themnselves.
777
### This function is based on sort of lower bounds to detect regions that intersect. We compare lower bound and upper bound of the porevious item. This function embed a while loop and break break regions list become stable.  
778
regions ##<< The Regions to be aggregated
779
) {
780
  old_length = length(regions[,1])
781
  new_length = 0
782
    
783
  while (old_length != new_length) {
784
    regions = regions[order(regions$lower_bound), ]
785
    regions$stop = !c(regions$lower_bound[-1] - regions$upper_bound[-length(regions$lower_bound)] <= 0, TRUE)      
786
    vec_end_1 = which(regions$stop)
787
    if (length(vec_end_1) == 0) {
788
      vec_end_1 = c(length(regions$stop))      
789
    } 
790
    if (vec_end_1[length(vec_end_1)] != length(regions$stop)) {
791
      vec_end_1 = c(vec_end_1, length(regions$stop))
792
    }
793
    vec_beg_1 = c(1, vec_end_1[-length(vec_end_1)] + 1)
794
    union = apply(t(1:length(vec_beg_1)), 2, function(i) {
795
      chr = regions$chr[vec_beg_1[i]]
796
      lower_bound = min(regions$lower_bound[vec_beg_1[i]:vec_end_1[i]])
797
      upper_bound = max(regions$upper_bound[vec_beg_1[i]:vec_end_1[i]])
798
      roi_index = regions$roi_index[vec_beg_1[i]] 
799
      data.frame(list(chr=chr, lower_bound=lower_bound, upper_bound=upper_bound, roi_index=roi_index))
800
      })
801
    union = do.call("rbind", union)        
802
    union$lower_bound = as.numeric(union$lower_bound)
803
    union$upper_bound = as.numeric(union$upper_bound)
804
    old_length = length(regions[,1])
805
    new_length = length(union[,1])
806
    regions = union
807
  }
808
  return(union)
809
}
810

    
811
remove_aligned_wp = function(# Remove wp nucs from common nucs list.
812
### It is based on common wp nucs index on nucs and region.  
813
strain_maps, ##<< Nuc maps.
814
roi_index, ##<< The region of interest index.
815
tmp_common_nucs, ##<< the list of wp nucs.
816
strain##<< The strain to consider.
817
){
818
  fuzzy_nucs = strain_maps[[strain]]
819
  fuzzy_nucs = fuzzy_nucs[fuzzy_nucs$roi_index == roi_index,]
820
  fuzzy_nucs = fuzzy_nucs[order(fuzzy_nucs$index_nuc),]
821
  if (length(fuzzy_nucs[,1]) == 0) {return(fuzzy_nucs)}
822
  if (sum(fuzzy_nucs$index_nuc == min(fuzzy_nucs$index_nuc):max(fuzzy_nucs$index_nuc)) != max(fuzzy_nucs$index_nuc)) {"Warning in index!"}
823
  anti_index_1 = tmp_common_nucs[[paste("index_nuc", strain, sep="_")]]
824
  fuzzy_nucs = fuzzy_nucs[-anti_index_1,]
825
  return(fuzzy_nucs)
826
}
827

    
828
translate_regions = function(# Translate a list of regions from a strain ref to another.
829
### This function is an eloborated call to translate_roi.
830
regions, ##<< Regions to be translated. 
831
combi, ##<< Combination of strains.
832
roi_index, ##<< The region of interest index.
833
config=NULL, ##<< GLOBAL config variable
834
roi ##<< The region of interest.
835
) {
836
  tr_regions = apply(t(1:length(regions[,1])), 2, function(i) {
837
    tmp_regions_ref2 = list(name="foo", begin=regions[i,]$lower_bound, end=regions[i,]$upper_bound, chr=as.character(regions[i,]$chr), strain_ref = combi[2])
838
    big_roi =  translate_roi(roi, tmp_regions_ref2$strain_ref, config)
839
    tmp_min = min(big_roi$begin, big_roi$end)
840
    tmp_max = max(big_roi$begin, big_roi$end)
841
    big_roi$begin = tmp_min
842
    big_roi$end = tmp_max
843
    trs_tmp_regions_ref2 = translate_roi(tmp_regions_ref2, combi[1], config, big_roi) 
844
    data.frame(list(chr=trs_tmp_regions_ref2$chr, lower_bound=min(trs_tmp_regions_ref2$begin, trs_tmp_regions_ref2$end), upper_bound=max(trs_tmp_regions_ref2$begin, trs_tmp_regions_ref2$end), roi_index=roi_index))
845
    })
846
  tr_regions = do.call("rbind", tr_regions)      
847
  tr_regions$lower_bound = as.numeric(tr_regions$lower_bound)
848
  tr_regions$upper_bound = as.numeric(tr_regions$upper_bound)
849
  tr_regions = tr_regions[order(tr_regions$lower_bound),]      
850
  return(tr_regions)
851
}
852

    
853
extract_wp = function(# Extract wp nucs from nuc map. 
854
### Function based on common wp nuc index and roi_index.
855
strain_maps, ##<< Nuc maps.
856
roi_index, ##<< The region of interest index.
857
strain, ##<< The strain to consider.
858
tmp_common_nucs ##<< the list of wp nucs.
859
) {
860
  wp_nucs = apply(t(tmp_common_nucs[[paste("index_nuc", strain, sep="_")]]), 2, function(i) {
861
    tmp_wp_nucs = strain_maps[[strain]]
862
    tmp_wp_nucs = tmp_wp_nucs[tmp_wp_nucs$roi_index == roi_index & tmp_wp_nucs$index_nuc == i,]
863
    return(tmp_wp_nucs)
864
    })
865
  wp_nucs = do.call("rbind", wp_nucs)      
866
  wp_nucs$lower_bound = as.numeric(wp_nucs$lower_bound)
867
  wp_nucs$upper_bound = as.numeric(wp_nucs$upper_bound)
868
  wp_nucs = wp_nucs[order(wp_nucs$lower_bound),]      
869
  return(wp_nucs)
870
}
871

    
872

    
873

    
874

    
875

    
876
crop_fuzzy = function(# Crop bound of regions according to region of interest bound
877
### The fucntion is no more necessary since we remove "big_roi" bug in translate_roi function.
878
tmp_fuzzy_nucs, ##<< the regiuons to be croped.
879
roi, ##<< The region of interest.
880
strain, ##<< The strain to consider.
881
config=NULL ##<< GLOBAL config variable
882
) {
883
  tr_roi = translate_roi(roi, strain, config)
884
  tr_roi_begin = min(tr_roi$begin, tr_roi$end)
885
  tr_roi_end = max(tr_roi$begin, tr_roi$end)
886
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound < tr_roi_begin,1]) > 0) {
887
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound < tr_roi_begin,]$lower_bound = tr_roi_begin
888
  }
889
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound < tr_roi_begin,1]) > 0) {
890
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound < tr_roi_begin,]$upper_bound = tr_roi_begin
891
  }
892
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound > tr_roi_end,1]) > 0) {
893
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound > tr_roi_end,]$lower_bound = tr_roi_end
894
  }
895
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound > tr_roi_end,1]) > 0) {
896
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound > tr_roi_end,]$upper_bound = tr_roi_end
897
  }
898
  tmp_fuzzy_nucs = tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound != tmp_fuzzy_nucs$lower_bound,]
899
  return(tmp_fuzzy_nucs)
900
}
901

    
902

    
903
get_fuzzy = function(# Compute the fuzzy nucs.
904
### This function aggregate non common wp nucs for each strain and substract common wp nucs. It does not take care about the size of the resulting fuzzy regions. It will be take into account in the count read part og the pipeline.
905
combi, ##<< The strain combination to consider.
906
roi, ##<< The region of interest.
907
roi_index, ##<< The region of interest index.
908
strain_maps, ##<< Nuc maps.
909
common_nuc_results, ##<< Common wp nuc maps
910
config=NULL ##<< GLOBAL config variable
911
) {
912
  print(roi_index)
913
  PLOT = FALSE
914
  tmp_common_nucs = common_nuc_results[[paste(combi[1], combi[2], sep="_")]]
915
  tmp_common_nucs = tmp_common_nucs[tmp_common_nucs$roi_index == roi_index, ]
916

    
917
  print(paste("Dealing with fuzzy from", combi[1]))
918
  tmp_fuzzy_nucs_1 = remove_aligned_wp(strain_maps, roi_index, tmp_common_nucs, combi[1])
919
  tmp_fuzzy_nucs_1 = crop_fuzzy(tmp_fuzzy_nucs_1, roi, combi[1], config)
920
  if (length(tmp_fuzzy_nucs_1[,1]) == 0) {return(NULL)}
921
  agg_fuzzy_1 = union_regions(tmp_fuzzy_nucs_1)    
922
  if (PLOT) for (i in 1:length(agg_fuzzy_1[,1])) {
923
    lines(c(agg_fuzzy_1[i,]$lower_bound, agg_fuzzy_1[i,]$upper_bound), c(+3.1,+3.1), col=2)
924
  }
925

    
926
  print(paste("Dealing with fuzzy from ", combi[2]))
927
  tmp_fuzzy_nucs_2 = remove_aligned_wp(strain_maps, roi_index, tmp_common_nucs, combi[2])
928
  if (length(tmp_fuzzy_nucs_2[,1]) == 0) {return(NULL)}
929
  agg_fuzzy_2 = union_regions(tmp_fuzzy_nucs_2)      
930
  agg_fuzzy_2 = crop_fuzzy(agg_fuzzy_2, roi, combi[2], config)
931
  tr_agg_fuzzy_2 = translate_regions(agg_fuzzy_2, combi, roi_index, roi)
932
  tr_agg_fuzzy_2 = crop_fuzzy(tr_agg_fuzzy_2, roi, combi[2], config)
933
  # tr_agg_fuzzy_2 = union_regions(tr_agg_fuzzy_2)
934
  if (PLOT) for (i in 1:length(tr_agg_fuzzy_2[,1])) {
935
    lines(c(tr_agg_fuzzy_2[i,]$lower_bound, tr_agg_fuzzy_2[i,]$upper_bound), c(+3.3,+3.3), col=2)
936
  }
937
  
938
  print("Dealing with fuzzy from both...")
939
  all_fuzzy = union_regions(rbind(agg_fuzzy_1, tr_agg_fuzzy_2))
940
  if (PLOT) for (i in 1:length(all_fuzzy[,1])) {
941
    lines(c(all_fuzzy[i,]$lower_bound, all_fuzzy[i,]$upper_bound), c(+3.2, +3.2), col=1)
942
  }
943
  
944
  print(paste("Dealing with wp from", combi[1]))
945
  wp_nucs_1 = extract_wp(strain_maps, roi_index, combi[1], tmp_common_nucs)
946
  if (PLOT) for (i in 1:length(wp_nucs_1[,1])) {
947
    lines(c(wp_nucs_1[i,]$lower_bound, wp_nucs_1[i,]$upper_bound), c(+3.5,+3.5), col=3)
948
  }
949

    
950
  print(paste("Dealing with wp from", combi[2]))
951
  wp_nucs_2 = extract_wp(strain_maps, roi_index, combi[2], tmp_common_nucs)
952
  tr_wp_nucs_2 = translate_regions(wp_nucs_2, combi, roi_index, roi)
953
  if (PLOT) for (i in 1:length(tr_wp_nucs_2[,1])) {
954
    lines(c(tr_wp_nucs_2[i,]$lower_bound, tr_wp_nucs_2[i,]$upper_bound), c(+3.7,+3.7), col=3)
955
  }
956

    
957
  print("Dealing with wp from both...")
958
  all_wp = union_regions(rbind(wp_nucs_1[,1:4], tr_wp_nucs_2))
959
  if (PLOT) for (i in 1:length(all_wp[,1])) {
960
    lines(c(all_wp[i,]$lower_bound, all_wp[i,]$upper_bound), c(+3.6, +3.6), col=1)
961
  }  
962

    
963
  print("Dealing with fuzzy and wp...")
964
  non_inter_fuzzy = substract_region(all_fuzzy, all_wp)
965
  if (is.null(non_inter_fuzzy)) { return(NULL) }
966
  
967
  non_inter_fuzzy$len = non_inter_fuzzy$upper_bound - non_inter_fuzzy$lower_bound
968
  # non_inter_fuzzy = non_inter_fuzzy[non_inter_fuzzy$len >= min_fuzz_width,]
969
  if (PLOT) for (i in 1:length(non_inter_fuzzy[,1])) {
970
    lines(c(non_inter_fuzzy[i,]$lower_bound, non_inter_fuzzy[i,]$upper_bound), c(+3.9, +3.9), col=1)
971
  }
972
  
973
  non_inter_fuzzy$index_nuc = 1:length(non_inter_fuzzy[,1])
974
  return (non_inter_fuzzy)
975
}
976

    
977

    
978

    
979
get_all_reads = function(# Retrieve Reads 
980
### Retrieve reads for a given marker, combi, form.
981
marker, ##<< The marker to considere.
982
combi, ##<< The starin combination to considere.
983
form="wp" ##<< The nuc form to considere.
984
) {
985
	all_reads = NULL
986
  for (manip in c("Mnase_Seq", marker)) {
987
    if (form == "fuzzy") {
988
		  out_filename = paste("results/2013-08/",combi[1],"_",combi[2],"_",manip,"_fuzzy_and_nbreads.tab",sep="")
989
  		tmp_res = read.table(file=out_filename, header=TRUE)				
990
			tmp_res = tmp_res[tmp_res[,3] - tmp_res[,2] > 75,]
991
      tmp_res$form = form
992
    } else if (form == "wp") {
993
		 	out_filename = paste("results/2013-08/",combi[1],"_",combi[2],"_",manip,"_wp_and_nbreads.tab",sep="")
994
  		tmp_res = read.table(file=out_filename, header=TRUE)				
995
      tmp_res$form = form
996
    } else if (form == "wpfuzzy") {
997
		 	out_filename = paste("results/2013-08/",combi[1],"_",combi[2],"_",manip,"_wp_and_nbreads.tab",sep="")
998
  		tmp_res = read.table(file=out_filename, header=TRUE)				
999
      tmp_res$form = "wp"
1000
		  out_filename = paste("results/2013-08/",combi[1],"_",combi[2],"_",manip,"_fuzzy_and_nbreads.tab",sep="")
1001
  		tmp_res2 = read.table(file=out_filename, header=TRUE)				
1002
			tmp_res2 = tmp_res2[tmp_res2[,3] - tmp_res2[,2] > 75,]
1003
      tmp_res2$form = "fuzzy"
1004
      tmp_res = rbind(tmp_res, tmp_res2)
1005
    }
1006
		if (is.null(all_reads)) {
1007
			all_reads = tmp_res[,c(1:9,length(tmp_res))]
1008
		}     
1009
		tmp_res = tmp_res[,-c(1:9,length(tmp_res))]
1010
		all_reads = cbind(all_reads, tmp_res)
1011
  }
1012
  return(all_reads)	
1013
}
1014

    
1015
get_design = function(# Build the design for deseq
1016
### This function build the design according sample properties.
1017
marker, ##<< The marker to considere.
1018
combi, ##<< The starin combination to considere.
1019
all_samples ##<< Global list of samples.
1020
) {
1021
  off1 = 0
1022
  off2 = 0
1023
	manips = c("Mnase_Seq", marker)
1024
	design_rownames = c()
1025
	design_manip = c()
1026
	design_strain = c()
1027
  off2index = function(off) {	
1028
  	switch(toString(off), 
1029
  		"1"=c(0,1,1),
1030
  	  "2"=c(1,0,1),
1031
    	"3"=c(1,1,0), 
1032
  		c(1,1,1)
1033
  		)	
1034
  }
1035
	for (manip in manips) {
1036
		tmp_samples = all_samples[ ((all_samples$strain == combi[1] | all_samples$strain == combi[2]) &  all_samples$marker == manip), ]
1037
		tmp_samples = tmp_samples[order(tmp_samples$strain), ]
1038
		if (manip == "H3K4me1" & (off1 != 0 & off2 ==0 )) {
1039
			tmp_samples = tmp_samples[c(off2index(off1), c(1,1)) == 1,]
1040
		} else {
1041
			if (manip != "Mnase_Seq" & (off1 != 0 | off2 !=0)) {
1042
				tmp_samples = tmp_samples[c(off2index(off1), off2index(off2)) == 1,]
1043
			}
1044
		}
1045
		design_manip = c(design_manip, rep(manip, length(tmp_samples$id)))				
1046
		for (strain in combi) {			
1047
			cols = apply(t(tmp_samples[ (tmp_samples$strain == strain &  tmp_samples$marker == manip), ]$id), 2, function(i){paste(strain, manip, i, sep="_")})
1048
			design_strain = c(design_strain, rep(strain, length(cols)))
1049
			design_rownames = c(design_rownames, cols)
1050
		}
1051
	}
1052
	snep_design = data.frame( row.names=design_rownames, manip=design_manip, strain=design_strain)
1053
	return(snep_design)	
1054
}
1055

    
1056
plot_dist_samples = function(# Plot the distribution of reads.
1057
### This fuxntion use the deseq nomalization feature to compare qualitatively the distribution.
1058
strain, ##<< The strain to considere.
1059
marker, ##<< The marker to considere.
1060
res, ##<< Data
1061
all_samples, ##<< Global list of samples.
1062
NEWPLOT = TRUE ##<< If FALSE the curve will be add to the current plot.
1063
) {			
1064
	cols = apply(t(all_samples[ (all_samples$strain == strain &  all_samples$marker == marker), ]$id), 2, function(i){paste(strain, marker, i, sep="_")})
1065
	snepCountTable = res[,cols]
1066
	snepDesign = data.frame(
1067
		row.names = cols,
1068
		manip = rep(marker, length(cols)),
1069
		strain = rep(strain, length(cols))
1070
		)
1071
	cdsFull = newCountDataSet(snepCountTable, snepDesign)
1072
	sizeFactors = estimateSizeFactors(cdsFull)
1073
	# print(sizeFactors[[1]])
1074
	sample_ids = all_samples[ (all_samples$strain == strain &  all_samples$marker == marker), ]$id
1075
	if (NEWPLOT) {
1076
		plot(density(res[,paste(strain, marker, sample_ids[1], sep="_")] / sizeFactors[[1]][1]), col=0, main=paste(strain, marker))				
1077
		NEWPLOT = FALSE
1078
	}
1079
	for (it in 1:length(sample_ids)) {
1080
		sample_id = sample_ids[it]
1081
		lines(density(res[,paste(strain, marker, sample_id, sep="_")] / sizeFactors[[1]][it]), col = it + 1, lty = it)
1082
	}
1083
  legend("topright", col=(1:length(sample_ids))+1, lty=1:length(sample_ids), legend=cols)
1084
}
1085

    
1086
analyse_design = function(# Launch deseq methods.
1087
### This function is based on deseq example. It mormalizes data, fit data to GLM model with and without interaction term and compare the two l;=models.
1088
snep_design, ##<< The design to considere.
1089
reads ##<< The data to considere. 
1090
) {
1091
	snep_count_table = reads[, rownames(snep_design)]
1092
	cdsFull = newCountDataSet(snep_count_table, snep_design)
1093
	cdsFull1 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
1094
	fit1 = fitNbinomGLMs(cdsFull1, count ~ manip * strain)
1095

    
1096
	cdsFull0 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
1097
	fit0 = fitNbinomGLMs(cdsFull0, count ~ manip + strain)
1098

    
1099
	pvalsGLM = nbinomGLMTest( fit1, fit0 )
1100
	return(list(fit1, fit0, snep_design, pvalsGLM))	
1101
}
1102

    
1103

    
1104

    
1105

    
1106

    
1107

    
1108

    
1109

    
1110
get_sneps = structure(function(# Compute the list of SNEPs for a given set of marker, strain combination and nuc form.
1111
### This function uses 
1112
marker, ##<< The marker involved.
1113
combi, ##<< The strain combination involved.
1114
form, ##<< the nuc form involved. 
1115
all_samples ##<< Global list of samples.
1116
) {
1117
  # PRETREAT
1118
  d = get_design(marker, combi, all_samples)
1119
  reads = get_all_reads(marker, combi, form)
1120
  # RUN ANALYSE
1121
  tmp_analyse = analyse_design(d, reads)
1122
  # RESULTS
1123
	fit1 = tmp_analyse[[1]]
1124
	fit0 = tmp_analyse[[2]]
1125
  k = names(fit1)
1126
  reads[[k[2]]] = signif(fit1[[k[2]]], 5)
1127
  reads[[k[3]]] = signif(fit1[[k[3]]], 5)
1128
  reads[[k[4]]] = signif(fit1[[k[4]]], 5)
1129
	reads$pvalsGLM = signif(tmp_analyse[[4]], 5)
1130
	snep_design = tmp_analyse[[3]]
1131
  print(snep_design)
1132
	fdr = 0.0001
1133
	thres = FDR(reads$pvalsGLM, fdr) 
1134
	reads$snep_index = reads$pvalsGLM < thres
1135
	print(paste(sum(reads$snep_index), " SNEPs found for ", length(reads[,1])," nucs and ", fdr*100,"% of FDR.", sep = ""))
1136
  return(reads)
1137
  },  ex=function(){	
1138
    marker = "H3K4me1"
1139
    combi = c("BY", "YJM") 
1140
    form = "wpfuzzy" # "wp" | "fuzzy" | "wpfuzzy"
1141
    # foo = get_sneps(marker, combi, form)
1142
    # foo = get_sneps("H4K12ac", c("BY", "RM"), "wp")
1143
})
1144

    
1145

    
1146

    
1147

    
1148

    
1149

    
1150

    
1151

    
1152

    
1153

    
1154

    
1155

    
1156

    
1157

    
1158

    
1159

    
1160

    
1161

    
1162

    
1163

    
1164

    
1165

    
1166

    
1167

    
1168
ROM2ARAB = function(# Roman to Arabic pair list.
1169
### Util to convert Roman to Arabic
1170
){list(
1171
  "I" = 1,
1172
  "II" = 2,
1173
  "III" = 3,
1174
  "IV" = 4,
1175
  "V" = 5,
1176
  "VI" = 6,
1177
  "VII" = 7,
1178
  "VIII" = 8,
1179
  "IX" = 9,
1180
  "X" = 10,
1181
  "XI" = 11,
1182
  "XII" = 12,
1183
  "XIII" = 13,
1184
  "XIV" = 14,
1185
  "XV" = 15,
1186
  "XVI" = 16,
1187
  "XVII" = 17,
1188
  "XVIII" = 18,
1189
  "XIX" = 19,
1190
  "XX" = 20
1191
)}
1192

    
1193
switch_pairlist = structure(function(# Switch a pairlist
1194
### Take a pairlist key:value and return the switched pairlist value:key.
1195
l ##<< The pairlist to switch. 
1196
) {
1197
	ret = list()
1198
	for (name in names(l)) {
1199
		ret[[as.character(l[[name]])]] = name
1200
	}
1201
	ret
1202
### The switched pairlist.
1203
}, ex=function(){
1204
	l = list(key1 = "value1", key2 = "value2")
1205
	print(switch_pairlist(l))
1206
})
1207

    
1208
ARAB2ROM = function(# Arabic to Roman pair list.
1209
### Util to convert Arabicto Roman
1210
){switch_pairlist(ROM2ARAB())}
1211

    
1212

    
1213
# translate_roi = structure(function(# Translate coords of a genome region.
1214
# ### This function is used in the examples, usualy you have to define your own translation function and overwrite this one using \emph{unlockBinding} features. Please, refer to the example.
1215
# roi, ##<< Original genome region of interest.
1216
# strain2, ##<< The strain in wich you want the genome region of interest.
1217
# big_roi=NULL ##<< A largest region than roi use to filter c2c if it is needed.
1218
# ) {
1219
# ### This function translate a genome region of interest from a strain coord system to an other. This is a minimal fucntion that will be called by \emph{align_inter_strain_nucs} and its exemnple, you need to overwrite it by your own fucntion.
1220
#   strain1 = roi$strain_ref
1221
#   if (strain1 == strain2 | strain2 == "strain_ex2") {
1222
#     return(roi)
1223
#   } else {
1224
#     stop("ERROR, you need to overwrite your own function to convert convert strain coords. To binf the new function, have a 
1225
#     look in the translate_roi example.")    
1226
#   }
1227
# ### The translated genome region of interest. 
1228
# }, ex=function(){
1229
#   # Define new translate_roi function...
1230
#   translate_roi = function(roi, strain2) {
1231
#     strain1 = roi$strain_ref
1232
#     if (strain1 == strain2) {
1233
#       return(roi)
1234
#     } else {
1235
#       stop("Here is my new translate_roi function...")    
1236
#     }  
1237
#   }
1238
#   # Binding it by uncomment follwing lines.
1239
#   # unlockBinding("translate_roi", as.environment("package:nm"))
1240
#   # unlockBinding("translate_roi", getNamespace("nm"))
1241
#   # assign("translate_roi", translate_roi, "package:nm")
1242
#   # assign("translate_roi", translate_roi, getNamespace("nm"))
1243
#   # lockBinding("translate_roi", getNamespace("nm"))
1244
#   # lockBinding("translate_roi", as.environment("package:nm"))  
1245
# })
1246
# 
1247
# 
1248

    
1249

    
1250
translate_roi = structure(function(# Translate coords of a genome region.
1251
### This function is used in the examples, usualy you have to define your own translation function and overwrite this one using \emph{unlockBinding} features. Please, refer to the example.
1252
roi, ##<< Original genome region of interest.
1253
strain2, ##<< The strain in wich you want the genome region of interest.
1254
config=NULL, ##<< GLOBAL config variable
1255
big_roi=NULL ##<< A largest region than roi use to filter c2c if it is needed.
1256
) {
1257
	strain1 = roi$strain_ref
1258
  reverse = (strain1=="RM" & strain2=="BY") | strain1=="YJM"
1259
	if (strain1 == strain2) {
1260
		roi$length = roi$end - roi$begin + sign(roi$end - roi$begin) * 1	
1261
		return(roi)
1262
	}
1263
	# Launch c2c file
1264
	if (reverse) {
1265
		c2c_file = list(filename=config$C2C_FILES[[paste(strain2, "-", strain1, sep="")]])
1266
	} else {
1267
		c2c_file = list(filename=config$C2C_FILES[[paste(strain1, "-", strain2, sep="")]])
1268
	}
1269
	c2c = get_content(c2c_file$filename, "table", stringsAsFactors=FALSE)	
1270
	# filtering it
1271
  c2c = c2c[c2c$V6=="-",]
1272
	# Reverse
1273
	if (reverse) {
1274
		tmp_col = c2c$V1
1275
		c2c$V1 = c2c$V7
1276
		c2c$V7 = tmp_col
1277
		tmp_col = c2c$V2
1278
		c2c$V2 = c2c$V9
1279
		c2c$V9 = tmp_col
1280
		tmp_col = c2c$V3
1281
		c2c$V3 = c2c$V10
1282
		c2c$V10 = tmp_col
1283
	}
1284
	# Restrict c2c to big_roi
1285
	# if (FALSE) {
1286
	if (!is.null(big_roi)) {
1287
		if (roi$strain_ref == big_roi$strain_ref) {
1288
			if (strain1 == "BY") {
1289
				big_chro_1 = paste("chr", ARAB2ROM()[[big_roi$chr]], sep="")
1290
			} else if (strain1 == "RM") {			
1291
			  big_chro_1 = paste("supercontig_1.",big_roi$chr,sep="")
1292
			} else if (strain1 == "YJM") {			
1293
			  big_chro_1 = switch_pairlist(config$FASTA_INDEXES$YJM)[[big_roi$chr]]
1294
			}
1295
			big_begin_1 = big_roi$begin
1296
		  big_end_1 = big_roi$end
1297
			c2c = c2c[c2c$V1==big_chro_1,]
1298
      if (length(c2c[c2c$V3<big_begin_1, 1] > 0)) {c2c[c2c$V3 < big_begin_1,c("V2", "V3") ] = big_begin_1}
1299
      if (length(c2c[c2c$V2>big_end_1, 1] > 0)) {c2c[c2c$V2 > big_end_1, c("V2", "V3")] = big_end_1}
1300
      c2c = c2c[c2c$V2 - c2c$V3 != 0,]
1301
		} else {
1302
			stop("ERROR, big_roi and roi not in the same strain_ref")
1303
		}
1304
	}
1305
  #	Convert initial roi$chr into c2c format
1306
	if (strain1 == "BY") {
1307
		chro_1 = paste("chr", ARAB2ROM()[[roi$chr]], sep="")
1308
	} else if (strain1 == "RM") {			
1309
	  chro_1 = paste("supercontig_1.",roi$chr,sep="")
1310
	} else if (strain1 == "YJM") {			
1311
	  chro_1 = switch_pairlist(config$FASTA_INDEXES$YJM)[[roi$chr]]
1312
	}
1313
	begin_1 = roi$begin
1314
  end_1 = roi$end
1315
  # Computing equivalent strain_2 alignment coordinates	
1316
	if (reverse) {
1317
  	tmptransfostart = c2c[c2c$V1==chro_1 & ((c2c$V3>=begin_1 & c2c$V2<=begin_1 & c2c$V8==1) | (c2c$V2>=begin_1 & c2c$V3<=begin_1 & c2c$V8==-1)),]
1318
    tmptransfostop = c2c[c2c$V1==chro_1 &  ((c2c$V3>=end_1   & c2c$V2<=end_1   & c2c$V8==1) | (c2c$V2>=end_1   & c2c$V3<=end_1   & c2c$V8==-1)),]
1319
	} else {
1320
		tmptransfostart = c2c[c2c$V1==chro_1 & c2c$V3>=begin_1 & c2c$V2<=begin_1,]
1321
	  tmptransfostop = c2c[c2c$V1==chro_1 & c2c$V3>=end_1 & c2c$V2<=end_1,]
1322
	}
1323
	# Never happend conditions ...	
1324
	{
1325
		if (length(tmptransfostart$V8) == 0) {
1326
			# begin_1 is between to lines: shift begin_1 to the start of 2nd line.
1327
			tmp_c2c = c2c[c2c$V1==chro_1 & c2c$V2>=begin_1,]
1328
			begin_1 = sort(tmp_c2c$V2)[1]
1329
			if (reverse) {
1330
		  	tmptransfostart = c2c[c2c$V1==chro_1 & ((c2c$V3>=begin_1 & c2c$V2<=begin_1 & c2c$V8==1) | (c2c$V2>=begin_1 & c2c$V3<=begin_1 & c2c$V8==-1)),]
1331
			} else {
1332
				tmptransfostart = c2c[c2c$V1==chro_1 & c2c$V3>=begin_1 & c2c$V2<=begin_1,]
1333
			}
1334
			if (length(tmptransfostart$V8) == 0) {
1335
				if (!is.null(big_roi)) {
1336
					return(NULL)
1337
					tmptransfostart = c2c[c2c$V1==chro_1 & c2c$V3>=big_roi$begin & c2c$V2<=big_roi$begin,]
1338
				} else { 
1339
					# return(NULL)
1340
					# print(c2c[c2c$V1==chro_1 & c2c$V2<=end_1 & c2c$V3>=begin_1,])
1341
					# print(c2c[c2c$V1==chro_1,])
1342
					print(tmptransfostart)
1343
					print(tmptransfostop)
1344
					stop("Never happend condition 1.")					
1345
				}
1346
			}
1347
		} 
1348
		if (length(tmptransfostop$V8) == 0) {
1349
			# end_1 is between to lines: shift end_1 to the end of 2nd line.
1350
			tmp_c2c = c2c[c2c$V1==chro_1 & c2c$V3<=end_1,]
1351
			end_1 = sort(tmp_c2c$V3)[length(tmp_c2c$V2)]
1352
			if (reverse) {
1353
		    tmptransfostop = c2c[c2c$V1==chro_1 &  ((c2c$V3>=end_1   & c2c$V2<=end_1   & c2c$V8==1) | (c2c$V2>=end_1   & c2c$V3<=end_1   & c2c$V8==-1)),]
1354
			} else {
1355
			  tmptransfostop = c2c[c2c$V1==chro_1 & c2c$V3>=end_1 & c2c$V2<=end_1,]
1356
			}
1357
			if (length(tmptransfostop$V8) == 0) {
1358
				if (!is.null(big_roi)) {
1359
					return(NULL)
1360
				  tmptransfostop = c2c[c2c$V1==chro_1 & c2c$V3>=big_roi$end & c2c$V2<=big_roi$end,]
1361
				} else {
1362
					# return(NULL)
1363
					print(c2c[c2c$V1==chro_1,])
1364
					print(tmptransfostart)
1365
					print(tmptransfostop)
1366
					stop("Never happend condition 2.")
1367
				} 
1368
			} 
1369
		}
1370
		if (length(tmptransfostart$V8) != 1) {
1371
			# tmptransfostart = tmptransfostart[1,]
1372
			# print("many start")
1373
			# print(c2c[c2c$V1==chro_1,])
1374
			tmptransfostart = tmptransfostart[tmptransfostart$V1==chro_1 & tmptransfostart$V3>=begin_1 & tmptransfostart$V2==begin_1,]
1375
			if (length(tmptransfostart$V8) != 1) {
1376
				# return(NULL)
1377
				print(tmptransfostart)
1378
				print(tmptransfostop)
1379
  			stop("Never happend condition 3.")
1380
			}
1381
		}
1382
		if (length(tmptransfostop$V8) != 1) {
1383
			# tmptransfostop = tmptransfostop[length(tmptransfostop$V8),]
1384
			# print("many stop")
1385
			# print(tmptransfostop)
1386
			# print(roi)
1387
		  tmptransfostop = tmptransfostop[tmptransfostop$V1==chro_1 & tmptransfostop$V3==end_1 & tmptransfostop$V2<=end_1,]
1388
			if (length(tmptransfostop$V8) != 1) {
1389
				# return(NULL)
1390
				print(tmptransfostart)
1391
				print(tmptransfostop)
1392
  			stop("Never happend condition 4.")
1393
			}
1394
		}		
1395
		if (tmptransfostart$V7 != tmptransfostop$V7) {
1396
			print(tmptransfostart)
1397
			print(tmptransfostop)		
1398
 			stop("Problem with genome region of interest of strain 1. \nIt is translated over many contigs into strain 2 ref. \nSorry, but you have to redefine your region of interest.")
1399
		}
1400
	} 
1401
  # Deal with strand
1402
  if (tmptransfostart$V8 == 1) {
1403
    begin_2 = tmptransfostart$V9 + (begin_1 - tmptransfostart$V2)
1404
    end_2 = tmptransfostop$V9 + (end_1 - tmptransfostop$V2)
1405
  } else {
1406
    begin_2 = tmptransfostart$V9 - (begin_1 - tmptransfostart$V2)
1407
    end_2 = tmptransfostop$V9 - (end_1 - tmptransfostop$V2)
1408
  }
1409
	# Build returned roi
1410
	roi$strain_ref = strain2
1411
	if (roi$strain_ref == "BY") {
1412
		roi$chr = ROM2ARAB()[[substr(tmptransfostart$V7, 4, 12)]]
1413
	} else {
1414
		roi$chr = config$FASTA_INDEXES[[strain2]][[tmptransfostart$V7]]
1415
	}
1416
  roi$begin = begin_2
1417
  roi$end = end_2
1418
	if (sign(roi$end - roi$begin) == 0) {
1419
		roi$length = 1			
1420
	} else {		
1421
		roi$length = roi$end - roi$begin + sign(roi$end - roi$begin) * 1	
1422
	}
1423
  return(roi)  
1424
}, ex=function(){
1425
	# Define new translate_roi function...
1426
	translate_roi = function(roi, strain2, config) {
1427
		strain1 = roi$strain_ref
1428
		if (strain1 == strain2) {
1429
			return(roi)
1430
		} else {
1431
		  stop("Here is my new translate_roi function...")		
1432
		}	
1433
	}
1434
	# Binding it by uncomment follwing lines.
1435
	# unlockBinding("translate_roi", as.environment("package:nm"))
1436
	# unlockBinding("translate_roi", getNamespace("nm"))
1437
	# assign("translate_roi", translate_roi, "package:nm")
1438
	# assign("translate_roi", translate_roi, getNamespace("nm"))
1439
	# lockBinding("translate_roi", getNamespace("nm"))
1440
	# lockBinding("translate_roi", as.environment("package:nm"))	
1441
})
1442

    
1443

    
1444

    
1445

    
1446

    
1447

    
1448

    
1449
compute_inter_all_strain_curs = function (# Compute Common Uninterrupted Regions (CUR)
1450
### CURs are regions that can be aligned between the genomes 
1451
diff_allowed = 10, ##<< the maximum indel width allowe din a CUR
1452
min_cur_width = 200, ##<< The minimum width of a CUR
1453
config=NULL, ##<< GLOBAL config variable
1454
plot = FALSE ##<< Plot CURs or not
1455
) {
1456
  get_inter_strain_rois = function(strain1, strain2, diff_allowed = 10, min_cur_width = 200, plot=FALSE) {
1457
  	c2c_file = list(filename=config$C2C_FILES[[paste(strain1, "-", strain2, sep="")]])
1458
  	c2c = get_content(c2c_file$filename, "table", stringsAsFactors=FALSE)
1459
    # Filtering unagapped
1460
    c2c = c2c[c2c$V6=="-",]
1461
    # filtering some things (chr...)
1462
    # c2c = c2c[c2c$V1 == "chrIV",]
1463
    diff = c2c$V2[-1] - c2c$V3[-length(c2c$V2)] 
1464
    diff2 = c2c$V9[-1] - c2c$V10[-length(c2c$V2)] 
1465
  	# Plot diffs to define a threshold (diff_allowed)
1466
  	# hist(abs(c(diff2, diff)),breaks=c(0:2000, 200000000000), xlim=c(0,100))
1467
    # Filtering
1468
  	indexes_stop = which(abs(diff) > diff_allowed | abs(diff2) > diff_allowed)
1469
  	indexes_start = c(1, indexes_stop[-length(indexes_stop)] + rep(1, length(indexes_stop) -1))
1470

    
1471
    rois = NULL	
1472
  	for(i in 1:length(indexes_start)) {
1473
  		start = indexes_start[i]
1474
  		stop = indexes_stop[i]
1475
  		sub_c2c = c2c[start:stop,]
1476
  		if (strain1 == "BY") {
1477
  			chr = ROM2ARAB()[[substr(sub_c2c[1,]$V1,4,10)]]
1478
  		} else {			
1479
  			chr = config$FASTA_INDEXES[[strain1]][[sub_c2c[1,]$V1]]
1480
  		}
1481
  		roi = list(chr=chr, begin=sub_c2c[1,]$V2, end=sub_c2c[length(sub_c2c$V1),]$V3, strain_ref=strain1)
1482
  		roi[["length"]] = roi$end - roi$begin
1483
  		if (roi$length >= min_cur_width) {
1484
        rois = dfadd(rois,roi)
1485
  	  }
1486
  		if (length(unique(sub_c2c[,c(1,7,8)])[,2]) != 1) {
1487
  			print("*************** ERROR, non homogenous region! ********************")
1488
  		}
1489
  		# print(i)
1490
  		# print(roi)
1491
  		# print(sub_c2c)
1492
  		# print("________________________________________________________________")
1493
  	}
1494
  	if (plot) {
1495
  		print(paste(length(indexes_stop), "area of interest."))
1496
  	  # Plot rois
1497
  	  fasta_ref = list(filename=config$FASTA_REFERENCE_GENOME_FILES[[strain1]])
1498
  	  genome = get_content(fasta_ref$filename, "fasta")   
1499
  		plot(0,0, ylim=(c(1,length(genome))), xlim = c(0, max(apply(t(genome), 2, function(chr){length(unlist(chr))}))))
1500
  		for (name in names(genome)) {
1501
  			if (strain1 == "BY") {
1502
  				chr_ref = paste("chr", ARAB2ROM()[[config$FASTA_INDEXES[[strain1]][[name]]]], sep="")
1503
  			} else {			
1504
  				chr_ref = name
1505
  			}
1506
  			y_lev = as.integer(config$FASTA_INDEXES[[strain1]][[name]])
1507
  			lines(c(0,length(unlist(genome[[name]]))), c(y_lev,y_lev))
1508
  			text( length(unlist(genome[[name]]))/2, y_lev, labels = chr_ref)	
1509
  		}
1510
  	  col=1
1511
  	  for (roi_index in 1:length(rois$chr)) {
1512
  			roi = rois[roi_index,]	
1513
  			y_lev = as.integer(roi$chr) + 0.3
1514
  			lines(c(roi$begin,roi$end), c(y_lev,y_lev), col=col)
1515
  			text( mean(c(roi$begin,roi$end)), y_lev, labels = roi_index)
1516
  	  	col = col + 1
1517
  	  }	
1518
  	}
1519
  	return(rois)
1520
  }
1521
	rois = NULL	
1522
	rois_BY_RM = get_inter_strain_rois("BY", "RM", min_cur_width = min_cur_width, diff_allowed = diff_allowed)
1523
	rois_BY_YJM = get_inter_strain_rois("BY", "YJM", min_cur_width = min_cur_width, diff_allowed = diff_allowed)
1524
	for (roi_1_index in 1:length(rois_BY_RM[,1])) {
1525
		roi_1 = rois_BY_RM[roi_1_index,]
1526
		roi_2_candidates = rois_BY_YJM[rois_BY_YJM$chr== roi_1$chr & rois_BY_YJM$begin <= roi_1$end & rois_BY_YJM$end >= roi_1$begin , ] ;  
1527
		# print(length(roi_2_candidates[,1]))
1528
		if (length(roi_2_candidates[,1]) > 0) {
1529
			for(roi_2_index in 1:length(roi_2_candidates[,1])) {
1530
				roi_2 = roi_2_candidates[roi_2_index,]
1531
				roi = list(chr=roi_1$chr, begin=max(roi_1$begin, roi_2$begin), end=min(roi_1$end, roi_2$end), strain_ref="BY")
1532
				roi[["length"]] = roi$end - roi$begin + 1
1533
				if (roi$length >= min_cur_width) {
1534
					# if (length(rois[,1]) == 153) {
1535
					# 	print(paste(length(rois[,1]), roi_1_index, roi_2_index ))
1536
					# 	print(roi_1)
1537
					# 	print(roi_2)
1538
					# 	print(roi)
1539
					# }
1540
			    rois = dfadd(rois,roi)
1541
			  }			
1542
			}
1543
		}
1544
	}
1545
	print(length(rois[,1]))
1546
	print(sum(rois$length))
1547
	rois_1st_round = rois
1548
	rois_2nd_round = NULL	
1549
	rois_RM_YJM = get_inter_strain_rois("RM", "YJM", min_cur_width = min_cur_width, diff_allowed = diff_allowed)
1550
	for (roi_1_index in 1:length(rois_1st_round[,1])) {
1551
		roi_1 = rois_1st_round[roi_1_index,]
1552
		translated_roi_1 = translate_roi(roi_1, "RM", config)
1553
		t_b = min(translated_roi_1$begin, translated_roi_1$end)
1554
		t_e = max(translated_roi_1$begin, translated_roi_1$end)
1555
		roi_2_candidates = rois_RM_YJM[rois_RM_YJM$chr== translated_roi_1$chr & rois_RM_YJM$begin <= t_e & rois_RM_YJM$end >= t_b , ] ;  
1556
		if (length(roi_2_candidates[,1]) > 0) {
1557
			for(roi_2_index in 1:length(roi_2_candidates[,1])) {
1558
				roi_2 = roi_2_candidates[roi_2_index,]
1559
				roi = list(chr=translated_roi_1$chr, begin=max(t_b, roi_2$begin), end=min(t_e, roi_2$end), strain_ref="RM")
1560
				roi[["length"]] = roi$end - roi$begin + 1
1561
				if (roi$length >= min_cur_width) {
1562
			    rois_2nd_round = dfadd(rois_2nd_round,roi)
1563
			  }			
1564
			}
1565
		}
1566
	}
1567
	print(length(rois_2nd_round[,1]))
1568
	print(sum(rois_2nd_round$length))
1569
	rois = rois_2nd_round
1570

    
1571
	rois_translator_round = list()	
1572
	for (roi_index in 1:length(rois[,1])) {
1573
		roi = rois[roi_index,]
1574
		BY_roi  = translate_roi(roi, "BY", config)
1575
		tmp_BY_roi = BY_roi 
1576
		BY_roi$begin = min(tmp_BY_roi$begin, tmp_BY_roi$end)
1577
		BY_roi$end = max(tmp_BY_roi$begin, tmp_BY_roi$end)
1578
		BY_roi$length = abs(BY_roi$length)
1579
		rois_translator_round = dfadd(rois_translator_round, BY_roi)
1580
	}
1581
	rois = rois_translator_round
1582

    
1583
	rois_3rd_round = NULL	
1584
	for (roi_index in 1:length(rois[,1])) {
1585
	# for (roi_index in 1:2) {
1586
		current_roi = rois[roi_index,]
1587
		# print(roi_index)
1588
	
1589
	  to_be_check_rois = dfadd(NULL, current_roi)
1590
		NEED_RERUN = TRUE
1591
		while (NEED_RERUN) {
1592
			# print("RERUN"),
1593
			NEED_RERUN = FALSE
1594
			to_be_check_again = NULL
1595
			for (to_be_check_roi_index in 1:length(to_be_check_rois[,1])) {
1596
				# print(to_be_check_rois)
1597
				to_be_check_roi = to_be_check_rois[to_be_check_roi_index,]
1598
		    combis = list(c("BY", "RM"), c("BY", "YJM"), c("RM", "YJM"), c("RM", "BY"), c("YJM", "BY"), c("YJM", "RM"))
1599
			  for (combi in combis) {
1600
					# print(combi)
1601
			    strain1 = combi[1]
1602
		      strain2 = combi[2]
1603
					trans_roi = translate_roi(to_be_check_roi, strain1, config)
1604
					lower_bound=min(trans_roi$begin, trans_roi$end)
1605
					upper_bound=max(trans_roi$begin, trans_roi$end)			
1606
          
1607
          check_overlaping = structure(function(strain1 = "BY", strain2 = "RM", chr = NULL, lower_bound=NULL, upper_bound=NULL) {
1608
            reverse = (strain1=="RM" & strain2=="BY") | strain1=="YJM"
1609
          	if (strain1 == strain2) {
1610
          		roi$length = roi$end - roi$begin + sign(roi$end - roi$begin) * 1	
1611
          		return(roi)
1612
          	}
1613
          	# Launch c2c file
1614
          	if (reverse) {
1615
          		c2c_file = list(filename=config$C2C_FILES[[paste(strain2, "-", strain1, sep="")]])
1616
          	} else {
1617
          		c2c_file = list(filename=config$C2C_FILES[[paste(strain1, "-", strain2, sep="")]])
1618
          	}
1619
          	c2c = get_content(c2c_file$filename, "table", stringsAsFactors=FALSE)	
1620
          	# filtering it
1621
            c2c = c2c[c2c$V6=="-",]
1622
          	# Reverse
1623
          	if (reverse) {
1624
          		tmp_col = c2c$V1
1625
          		c2c$V1 = c2c$V7
1626
          		c2c$V7 = tmp_col
1627
          		tmp_col = c2c$V2
1628
          		c2c$V2 = c2c$V9
1629
          		c2c$V9 = tmp_col
1630
          		tmp_col = c2c$V3
1631
          		c2c$V3 = c2c$V10
1632
          		c2c$V10 = tmp_col
1633
          	}
1634

    
1635
          	if (strain1 == "BY") {
1636
          		chro_1 = paste("chr", ARAB2ROM()[[chr]], sep="")
1637
          	} else if (strain1 == "RM") {			
1638
          	  chro_1 = paste("supercontig_1.",chr,sep="")
1639
          	} else if (strain1 == "YJM") {			
1640
          	  chro_1 = switch_pairlist(config$FASTA_INDEXES$YJM)[[chr]]
1641
          	}
1642
          	# print(chro_1)
1643
          	if (!is.null(lower_bound) & !is.null(upper_bound)) {
1644
              if (reverse) {
1645
          	  	tmp_c2c = c2c[c2c$V1==chro_1 & ((c2c$V3>=lower_bound & c2c$V2<=upper_bound & c2c$V8==1) | (c2c$V2>=lower_bound & c2c$V3<=upper_bound & c2c$V8==-1)),]
1646
          		} else {
1647
          			tmp_c2c = c2c[c2c$V1==chro_1 & c2c$V3>=lower_bound & c2c$V2<=upper_bound,]
1648
          		}
1649
          	} else {
1650
            	tmp_c2c = c2c[c2c$V1 == chr,]
1651
          	}
1652
          	if (length(tmp_c2c[,1]) > 1) {
1653
          		pbs = apply(t(1:(length(tmp_c2c[,1]) - 1)), 2, function(i){
1654
          			# print(paste(i, "/", length(tmp_c2c[,1])))
1655
          			apply(t((i+1):length(tmp_c2c[,1])), 2, function(j){
1656
          				l1 = tmp_c2c[i,] 
1657
          				b1 = min(l1$V2, l1$V3)
1658
          				e1 = max(l1$V2, l1$V3)
1659
          				l2 = tmp_c2c[j,]
1660
          				b2 = min(l2$V2, l2$V3)
1661
          				e2 = max(l2$V2, l2$V3)
1662
          				if ((e1>=b2 & b1<=e2) | (e2>=b1 & b2<=e1)) {
1663
          					print(paste("WARNING! Overlaping", " (", strain1, ",", strain2, ") chr: ",chr, " [", b1, ",", e1, "] [", b2, ",", e2, "]", sep=""))				
1664
          					pb = list(strain1, strain2, chr, b1, e1, b2, e2)					
1665
          					pb
1666
          				} else {
1667
          					NULL
1668
          				}
1669
          			})
1670
          		})
1671
          		return(pbs)
1672
          	}
1673

    
1674
          }, ex=function(){
1675
          	source("src/nucleo_miner/yeast_strain_conversion.R"); 
1676
          	pbs1 = check_overlaping(strain1 = "BY", strain2 = "RM", dest=TRUE)
1677
          	pbs3 = check_overlaping(strain1 = "BY", strain2 = "YJM", dest=TRUE)
1678
          	pbs5 = check_overlaping(strain1 = "RM", strain2 = "YJM", dest=TRUE)
1679
          	pbs2 = check_overlaping(strain1 = "BY", strain2 = "RM", dest=FALSE)
1680
          	pbs4 = check_overlaping(strain1 = "BY", strain2 = "YJM", dest=FALSE)
1681
          	pbs6 = check_overlaping(strain1 = "RM", strain2 = "YJM", dest=FALSE)
1682
          })
1683
          
1684

    
1685
					res = check_overlaping(strain1 = strain1, strain2 = strain2, chr = trans_roi$chr, lower_bound=lower_bound, upper_bound=upper_bound) 
1686
					if (!is.null(res)) {
1687
						df_res = data.frame(matrix(unlist(res), ncol = 7, byrow=TRUE), stringsAsFactors=FALSE)		
1688
						interval = df_res[1,]
1689
						inter_min = as.numeric(max( min(interval$X4, interval$X5), min(interval$X6, interval$X7)))
1690
						inter_max = as.numeric(min( max(interval$X4, interval$X5), max(interval$X6, interval$X7)))
1691
						# print(paste("SPLIT ROI", roi_index, "for", combi[1], combi[2]))			
1692
						new_roi1 = trans_roi
1693
						new_roi2 = trans_roi
1694
						new_roi1$begin = lower_bound
1695
						new_roi1$end = inter_min - 1
1696
						new_roi1$length = new_roi1$end - new_roi1$begin + 1
1697
						new_roi2$begin = inter_max + 1
1698
						new_roi2$end = upper_bound
1699
						new_roi2$length = new_roi2$end - new_roi2$begin + 1
1700
						if (new_roi1$length > min_cur_width) {
1701
							BY_roi  = translate_roi(new_roi1, "BY", config)
1702
							tmp_BY_roi = BY_roi
1703
							BY_roi$begin = min(tmp_BY_roi$begin, tmp_BY_roi$end)
1704
							BY_roi$end = max(tmp_BY_roi$begin, tmp_BY_roi$end)
1705
							BY_roi$length = abs(BY_roi$length)
1706
							to_be_check_again = dfadd(to_be_check_again, BY_roi)	
1707
						}
1708
						if (new_roi2$length > min_cur_width) {
1709
							BY_roi  = translate_roi(new_roi2, "BY", config)
1710
							tmp_BY_roi = BY_roi
1711
							BY_roi$begin = min(tmp_BY_roi$begin, tmp_BY_roi$end)
1712
							BY_roi$end = max(tmp_BY_roi$begin, tmp_BY_roi$end)
1713
							BY_roi$length = abs(BY_roi$length)
1714
							to_be_check_again = dfadd(to_be_check_again, BY_roi)	
1715
						}
1716
						if (to_be_check_roi_index < length(to_be_check_rois[,1])) {
1717
							for (i in (to_be_check_roi_index + 1):length(to_be_check_rois[,1])) {
1718
								to_be_check_again = dfadd(to_be_check_again, to_be_check_rois[i,])								
1719
							}
1720
						}
1721
						NEED_RERUN = TRUE
1722
						break
1723
					}
1724
				}
1725
				if (NEED_RERUN) {
1726
					to_be_check_rois = to_be_check_again
1727
					break
1728
				}
1729
			}
1730
		}
1731
		
1732
		checked_rois = to_be_check_rois
1733
		for (checked_roi_index in 1:length(checked_rois[,1])) {
1734
			rois_3rd_round = dfadd(rois_3rd_round, checked_rois[checked_roi_index,])	
1735
		}
1736
	}
1737

    
1738
	print(length(rois_3rd_round[,1]))
1739
	print(sum(rois_3rd_round$length))
1740
	rois = rois_3rd_round
1741
	
1742
	
1743
	if (plot) {
1744
		print(paste(length(rois$chr), "area of interest."))
1745
	  # Plot rois
1746
	  fasta_ref = list(filename=config$FASTA_REFERENCE_GENOME_FILES[["BY"]])
1747
	  genome = get_content(fasta_ref$filename, "fasta")   
1748
		plot(0,0, ylim=(c(1,length(genome))), xlim = c(0, max(apply(t(genome), 2, function(chr){length(unlist(chr))}))))
1749
		for (name in names(genome)) {
1750
			if (TRUE) {
1751
				chr_ref = paste("chr", ARAB2ROM()[[config$FASTA_INDEXES[["BY"]][[name]]]], sep="")
1752
			} else {			
1753
				chr_ref = name
1754
			}
1755
			y_lev = as.integer(config$FASTA_INDEXES[["BY"]][[name]])
1756
			lines(c(0,length(unlist(genome[[name]]))), c(y_lev,y_lev))
1757
			text( length(unlist(genome[[name]]))/2, y_lev, labels = chr_ref)	
1758
		}
1759
	  col=1
1760
	  for (roi_index in 1:length(rois$chr)) {
1761
			roi = rois[roi_index,]	
1762
			y_lev = as.integer(roi$chr) + 0.3
1763
			lines(c(roi$begin,roi$end), c(y_lev,y_lev), col=col)
1764
			text( mean(c(roi$begin,roi$end)), y_lev, labels = roi_index)
1765
	  	col = col + 1
1766
	  }	
1767
	}
1768
	return (rois)
1769
}
1770

    
1771

    
1772

    
1773

    
1774

    
1775

    
1776

    
1777

    
1778

    
1779

    
1780

    
1781

    
1782

    
1783

    
1784

    
1785

    
1786

    
1787

    
1788

    
1789

    
1790

    
1791

    
1792

    
1793

    
1794

    
1795

    
1796

    
1797

    
1798

    
1799

    
1800

    
1801

    
1802

    
1803

    
1804

    
1805

    
1806

    
1807

    
1808

    
1809

    
1810

    
1811

    
1812

    
1813

    
1814

    
1815

    
1816

    
1817

    
1818

    
1819

    
1820

    
1821

    
1822

    
1823

    
1824

    
1825

    
1826

    
1827

    
1828

    
1829

    
1830

    
1831

    
1832

    
1833

    
1834

    
1835

    
1836

    
1837

    
1838

    
1839

    
1840

    
1841

    
1842
perform_anovas = function(# Performaing ANOVAs
1843
### Counts reads and Performs ANOVAS for each common nucleosomes involved.
1844
replicates, ##<< Set of replicates, each replicate is a list of samples (ideally 3). Each sample is a list like \emph{sample = list(id=..., marker=..., strain=..., roi=..., inputs=..., outputs=...)} with \emph{roi = list(name=..., begin=...,  end=..., chr=..., genome=...)}. In the \emph{perform_anovas} contexte, we need 4 replicates (4 * (3 samples)): 2 strains * (1 marker + 1 input (Mnase_Seq)). 
1845
aligned_inter_strain_nucs, ##<< List of common nucleosomes.
1846
inputs_name="Mnase_Seq", ##<< Name of the input.
1847
plot_anova_boxes=FALSE ##<< Plot (or not) boxplot for each nuc.
1848
) {
1849
	anova_results = NULL
1850
	for (inter_strain_nuc_index in 1:length(aligned_inter_strain_nucs[,1])) {
1851
		inter_strain_nuc = aligned_inter_strain_nucs[inter_strain_nuc_index,]
1852
	  # counting reads
1853
		my_data = NULL
1854
		for (replicate_rank in 1:length(replicates)) {
1855
			samples = replicates[[replicate_rank]]
1856
			strain = samples[[1]]$strain
1857
			marker = samples[[1]]$marker
1858
			for (sample_rank in 1:length(samples)) {
1859
				sample = samples[[sample_rank]]
1860
				nuc_width = as.integer(inter_strain_nuc[[paste("upper_bound_",strain,sep="")]]) - inter_strain_nuc[[paste("lower_bound_",strain,sep="")]]
1861
				nuc_reads = filter_tf_inputs(sample$inputs, inter_strain_nuc[[paste("chr_",strain,sep="")]], inter_strain_nuc[[paste("lower_bound_",strain,sep="")]], inter_strain_nuc[[paste("upper_bound_",strain,sep="")]], nuc_width - 1)
1862
				indic = sum(nuc_reads[,4]) * 1000000/sample$total_reads / nuc_width 
1863
				my_data = dfadd(my_data, list(strain = strain, marker = marker, indic = indic))
1864
			}
1865
		}
1866
		my_data$strain = as.factor(my_data$strain)
1867
		my_data$marker = as.factor(my_data$marker)
1868
	
1869
		strain_ref1 = replicates[[1]][[1]]$strain
1870
		strain_ref2 = replicates[[2]][[1]]$strain 
1871
		marker = replicates[[length(replicates)]][[1]]$marker				
1872

    
1873
	  # Collecting anova results
1874
	  anova_result = list()
1875
		# nucs info
1876
		for (name in names(inter_strain_nuc)) {
1877
			anova_result[[name]] = inter_strain_nuc[[name]]
1878
		}
1879
	  #
1880
		for (strain in c(strain_ref1, strain_ref2)) {
1881
			for (manip in c(inputs_name, marker)) {
1882
				replicat = 1
1883
				for(indic in my_data[my_data$strain == strain & my_data$marker == manip,]$indic) {
1884
					anova_result[[paste(strain, manip, replicat, sep="_")]] = indic
1885
					replicat = replicat + 1
1886
				}			
1887
			}	
1888
		}
1889

    
1890
	  # Compute ANOVAs
1891
		mnase_data = my_data[my_data$marker == inputs_name,]
1892
		mnase_aov = aov(indic~strain,mnase_data)
1893
		mnase_effects = model.tables(mnase_aov)$tables
1894
		mnase_pvalues = summary(mnase_aov)[[1]]$Pr
1895
		# boxplot(indic~ma*st,mnase_data)
1896
		anova_result[["mnase_st"]] = mnase_effects$strain[1]
1897
		anova_result[["mnase_st_pvalue"]] = mnase_pvalues[1]	
1898
		# 
1899
		marker_data = my_data[my_data$marker == marker,]
1900
		marker_aov = aov(indic~strain,marker_data)
1901
		marker_effects = model.tables(marker_aov)$tables
1902
		marker_pvalues = summary(marker_aov)[[1]]$Pr
1903
		# boxplot(indic~ma*st,marker_data)
1904
		anova_result[["marker_st"]] = marker_effects$strain[1]
1905
		anova_result[["marker_st_pvalue"]] = marker_pvalues[1]
1906
		# 
1907
		st1_data = my_data[my_data$strain == strain_ref1,]
1908
		st1_aov = aov(indic~marker,st1_data)
1909
		st1_effects = model.tables(st1_aov)$tables
1910
		st1_pvalues = summary(st1_aov)[[1]]$Pr
1911
		# boxplot(indic~ma*st,st1_data)
1912
		anova_result[["st1_ma"]]=st1_effects$ma[inputs_name]
1913
		anova_result[["st1_ma_pvalue"]]=st1_pvalues[1]
1914
		# 
1915
		st2_data = my_data[my_data$strain == strain_ref2,]
1916
		st2_aov = aov(indic~marker,st2_data)
1917
		st2_effects = model.tables(st2_aov)$tables
1918
		st2_pvalues = summary(st2_aov)[[1]]$Pr
1919
		# boxplot(indic~ma*st,st2_data)
1920
		anova_result[["st2_ma"]]=st2_effects$ma[inputs_name]
1921
		anova_result[["st2_ma_pvalue"]]=st2_pvalues[1]
1922
		# 
1923
		correl_data = my_data
1924
		correl_aov = aov(indic~strain*marker,correl_data)
1925
		correl_effects = model.tables(correl_aov)$tables
1926
		correl_pvalues = summary(correl_aov)[[1]]$Pr
1927
		if (plot_anova_boxes) {
1928
			x11()
1929
			boxplot(indic~marker*strain,correl_data)
1930
		}
1931
		anova_result[["correl_st"]]=correl_effects$strain[1]
1932
		anova_result[["correl_ma"]]=correl_effects$ma[inputs_name]
1933
		anova_result[["correl_st_ma"]]=correl_effects$"strain:marker"[1,1]
1934
		anova_result[["correl_st_pvalue"]]=correl_pvalues[1]
1935
		anova_result[["correl_ma_pvalue"]]=correl_pvalues[2]
1936
		anova_result[["correl_st_ma_pvalue"]]=correl_pvalues[3]
1937
		
1938
		anova_results = dfadd(anova_results, anova_result)
1939
	}
1940
	return(anova_results)
1941
### Returns ANOVA results and comunted reads.
1942
}
1943

    
1944

    
1945
watch_samples = function(# Watching analysis of samples 
1946
### This function allows to view analysis for a particuler region of the genome.
1947
replicates, ##<< replicates under the form...
1948
read_length, ##<< length of the reads
1949
plot_ref_genome = TRUE, ##<< Plot (or not) reference genome.
1950
plot_arrow_raw_reads = TRUE,  ##<< Plot (or not) arrows for raw reads.
1951
plot_arrow_nuc_reads = TRUE,  ##<< Plot (or not) arrows for reads aasiocied to a nucleosome.
1952
plot_squared_reads = TRUE,  ##<< Plot (or not) reads in the square fashion.
1953
plot_coverage = FALSE,  ##<< Plot (or not) reads in the covergae fashion. fashion.
1954
plot_gaussian_reads = TRUE,  ##<< Plot (or not) gaussian model of a F anf R reads.
1955
plot_gaussian_unified_reads = TRUE,  ##<< Plot (or not) gaussian model of a nuc.
1956
plot_ellipse_nucs = TRUE,  ##<< Plot (or not) ellipse for a nuc.
1957
plot_wp_nucs = TRUE,  ##<< Plot (or not) cluster of nucs
1958
plot_wp_nuc_model = TRUE,  ##<< Plot (or not) gaussian model for a cluster of nucs
1959
plot_common_nucs = TRUE,  ##<< Plot (or not) aligned reads.
1960
plot_anovas = FALSE,  ##<< Plot (or not) scatter for each nuc.
1961
plot_anova_boxes =  FALSE,  ##<< Plot (or not) boxplot for each nuc.
1962
plot_wp_nucs_4_nonmnase = FALSE,  ##<< Plot (or not) clusters for non inputs samples.
1963
aggregated_intra_strain_nucs = NULL, ##<< list of aggregated intra strain nucs. If NULL, it will be computed. 
1964
aligned_inter_strain_nucs = NULL, ##<< list of aligned inter strain nucs. If NULL, it will be computed.
1965
height = 10, ##<< Number of reads in per million read for each sample, graphical parametre for the y axis.
1966
config=NULL ##<< GLOBAL config variable
1967
){
1968
  returned_list = list()
1969
  # Computing global display parameters
1970
  if (replicates[[1]][[1]]$roi[["begin"]] < replicates[[1]][[1]]$roi[["end"]]) {
1971
	  x_min_glo = replicates[[1]][[1]]$roi[["begin"]]
1972
	  x_max_glo = replicates[[1]][[1]]$roi[["end"]]
1973
  } else {
1974
	  x_min_glo = - replicates[[1]][[1]]$roi[["begin"]]
1975
	  x_max_glo = - replicates[[1]][[1]]$roi[["end"]]
1976
  }
1977
	base_glo = 0
1978
	nb_rank_glo = 0
1979
  for (samples in replicates) {
1980
  	nb_rank_glo = nb_rank_glo + length(samples)
1981
  }
1982
	ylim_glo = c(base_glo, base_glo + height * nb_rank_glo)
1983
	y_min_glo = min(ylim_glo)
1984
	y_max_glo = max(ylim_glo)
1985
  delta_y_glo = y_max_glo - y_min_glo
1986
  # Plot main frame
1987
  plot(c(x_min_glo,x_max_glo), c(0,0), ylim=ylim_glo, col=0, yaxt="n", ylab="#reads (per million reads)", xlab=paste("Ref strain:", replicates[[1]][[1]]$strain, "chr: ", replicates[[1]][[1]]$roi$chr), main="NucleoMiner2" )
1988
  axis(2, at=0:(nb_rank_glo*2) * delta_y_glo / (nb_rank_glo*2), labels=c(rep(c(height/2,0),nb_rank_glo),height/2))
1989
  # Go
1990
	replicates_wp_nucs = list()
1991
  for (replicate_rank in 1:length(replicates)) {
1992
		# Computing replicate parameters
1993
		nb_rank = length(samples)
1994
		base = (replicate_rank-1) * height * nb_rank
1995
		ylim = c(base, base + height * nb_rank)
1996
		y_min = min(ylim)
1997
		y_max = max(ylim)
1998
	  delta_y = y_max - y_min
1999
		samples = replicates[[replicate_rank]]
2000
		for (sample_rank in 1:length(samples)) {
2001
			# computing sample parameters
2002
			sample = samples[[sample_rank]]
2003
			y_lev = y_min + (sample_rank - 0.5) * delta_y/nb_rank
2004
			text(x_min_glo, y_lev + height/2 - delta_y_glo/100, labels=paste("(",sample$id,") ",sample$strain, " ", sample$marker, sep=""))
2005

    
2006
		  if (sample$roi[["begin"]] < sample$roi[["end"]]) {
2007
			  x_min = sample$roi[["begin"]]
2008
			  x_max = sample$roi[["end"]]
2009
		  } else {
2010
			  x_min = - sample$roi[["begin"]]
2011
			  x_max = - sample$roi[["end"]]
2012
		  }		
2013
			shift = x_min_glo - x_min
2014
	    # Plot Genome seq
2015
			if (plot_ref_genome) {
2016
		  	text(1:length(sample$roi$genome) + x_min - 1 + shift, rep(y_lev - height/2, length(sample$roi$genome)), labels=toupper(sample$roi$genome), cex=dev.size()[1]*9/(x_max-x_min), family="Courier")
2017
		  }
2018
			# Plot reads
2019
			reads = sample$inputs
2020
			signs = sign_from_strand(reads[,3])	
2021
			if (plot_arrow_raw_reads) {
2022
				arrows(sign(x_min) * reads[,2] + shift, sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads + y_lev, sign(x_min) * (reads[,2] + signs * read_length) + shift, sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads + y_lev, 
2023
				col=1, length=0.15/nb_rank)          	
2024
			}
2025
	    if (plot_squared_reads) {    	
2026
        # require(plotrix)
2027
				rect(sign(x_min) * reads[,2] + shift, rep(y_lev,length(reads[,1])), sign(x_min) * (reads[,2] + signs * read_length) + shift,  y_lev + sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads, col=adjustcolor(1, alpha.f = 0.1),border=0)
2028
			}
2029
	    if (plot_coverage) {    	
2030
        if (length(reads[,1]) != 0) {
2031
          reads[1,1] = 0  
2032
          reads[1,2] = 0
2033
          reads[1,3] = 0
2034
          reads[1,4] = 0
2035
          signs = c(1) 
2036
          step_h = sign(x_min) * signs * reads[,4]
2037
          step_b = sign(x_min) * reads[,2] + shift
2038
          step_e = sign(x_min) * (reads[,2] + signs * 150) + shift
2039
          steps_x = min(step_b, step_e):max(step_b, step_e)          
2040
          steps_y = rep(0, length(steps_x))
2041
          for (i in 1:length(step_h)) {          
2042
            steps_y[which(steps_x==min(step_b[i], step_e[i]))] =  steps_y[which(steps_x==min(step_b[i], step_e[i]))] + abs(step_h[i])
2043
            steps_y[which(steps_x==max(step_b[i], step_e[i]))] =  steps_y[which(steps_x==max(step_b[i], step_e[i]))] - abs(step_h[i])
2044
          }
2045
          tmp_index = which(steps_y != 0)
2046
          steps_x = steps_x[tmp_index]
2047
          steps_y = steps_y[tmp_index]
2048
          tmp_current_level = 0
2049
          for (i in 1:length(steps_y)) {          
2050
            steps_y[i] = tmp_current_level + steps_y[i]
2051
            tmp_current_level = steps_y[i]
2052
          }
2053
          steps_y = c(0, steps_y)
2054
          steps_y = steps_y * 1000000/sample$total_reads
2055
        } else {
2056
          steps_y = c(0, 0, 0)
2057
          steps_x = c(sample$roi$begin, sample$roi$end)
2058
        }
2059
        lines(stepfun(steps_x, steps_y + y_lev), pch="")
2060
        abline(y_lev,0)
2061
        returned_list[[paste("cov", sample$id, sep="_")]] = stepfun(steps_x, steps_y)
2062
			}
2063
			# Plot nucs
2064
	    if (sample$marker == "Mnase_Seq" & (plot_squared_reads | plot_gaussian_reads | plot_gaussian_unified_reads | plot_arrow_nuc_reads)) {
2065
				nucs = sample$outputs
2066
				if (length(nucs$center) > 0) {
2067
					col = 1
2068
		      for (i in 1:length(nucs$center)) {
2069
						col = col + 1
2070
		        nuc = nucs[i,]
2071
						involved_reads = filter_tf_inputs(reads, sample$roi$chr, nuc$lower_bound, nuc$upper_bound, nuc_width = nuc$width)
2072
				  	involved_signs = apply(t(involved_reads[,3]), 2, function(strand) {	if (strand == "F") return(1) else return(-1)})
2073
						total_involved_reads = sum(involved_reads[,4]) 
2074
						if (plot_arrow_nuc_reads ) {
2075
							arrows(sign(x_min) * involved_reads[,2] + shift, sign(x_min) * involved_signs * involved_reads[,4] * 1000000/sample$total_reads + y_lev, sign(x_min) * (involved_reads[,2] + involved_signs * read_length) + shift, sign(x_min) * involved_signs * involved_reads[,4] * 1000000/sample$total_reads + y_lev, 
2076
							col=col, length=0.15/nb_rank)          	
2077
						}
2078
	          if (plot_gaussian_reads | plot_gaussian_unified_reads) {
2079
  						flatted_reads = flat_reads(involved_reads, nuc$width)
2080
	  					delta_x = (nuc$center - nuc$width):(nuc$center + nuc$width)
2081
		  			}
2082
	          if (plot_gaussian_reads ) {
2083
							flatted_reads = flat_reads(involved_reads, nuc$width)
2084
							delta_x = (nuc$center - nuc$width):(nuc$center + nuc$width)
2085
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[1]]), sd(flatted_reads[[1]])) * length(flatted_reads[[1]]) * sign(x_min) + y_lev, col=col)
2086
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[2]]), sd(flatted_reads[[2]])) * length(flatted_reads[[2]]) * -1 * sign(x_min) + y_lev, col=col)
2087
	          }
2088
	          if (plot_gaussian_unified_reads ) {
2089
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[3]]), sd(flatted_reads[[3]])) * length(flatted_reads[[3]]) + y_lev, col=col, lty=2)
2090
	          }
2091
	          if (plot_ellipse_nucs) {
2092
				      # require(plotrix)
2093
	  	 				draw.ellipse(sign(x_min) * nuc$center + shift, y_lev, nuc$width/2, total_involved_reads/nuc$width, border=col) 
2094
						}
2095
		      }
2096
		    } else {
2097
		      print("WARNING! No nucs to print.")
2098
		    }
2099
			}
2100
	  }
2101
	  # Plot wp nucs
2102
		if ((plot_wp_nucs_4_nonmnase | sample$marker == "Mnase_Seq") & plot_wp_nucs) {
2103
			if (samples[[1]]$marker == "Mnase_Seq") {
2104
				if (is.null(aggregated_intra_strain_nucs)) {
2105
	  			wp_nucs = aggregate_intra_strain_nucs(samples)[[1]]			
2106
				} else {
2107
					wp_nucs = aggregated_intra_strain_nucs[[replicate_rank]]
2108
				}
2109
		  } else {
2110
  			wp_nucs = replicates_wp_nucs[[replicate_rank-2]]
2111
		  }
2112
			replicates_wp_nucs[[replicate_rank]] = wp_nucs
2113
			for (wp_nuc in wp_nucs) {
2114
				if (wp_nuc$wp){
2115
					rect(sign(x_min) * wp_nuc$lower_bound + shift, y_min, sign(x_min) * wp_nuc$upper_bound + shift, y_max, col=adjustcolor(2, alpha.f = 0.1), border=1)
2116
					all_original_reads = c()
2117
					for(initial_nuc in wp_nuc$nucs) {
2118
						all_original_reads = c(all_original_reads, initial_nuc$original_reads)						
2119
					}
2120
					delta_x = wp_nuc$lower_bound:wp_nuc$upper_bound
2121
					if (FALSE) {
2122
					  rect(sign(x_min) * wp_nuc$lower_bound + shift, y_min, sign(x_min) * wp_nuc$upper_bound + shift, y_max, col="#EEEEEE", border=1)
2123
				  }
2124
					if (plot_wp_nuc_model) {
2125
					  lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(all_original_reads), sd(all_original_reads)) * length(all_original_reads) + y_min, col=1)				
2126
				  }
2127
				}
2128
			}
2129
		}
2130
	}	
2131
  
2132
	if (plot_common_nucs | plot_anovas | plot_anova_boxes) {
2133
		if (is.null(aligned_inter_strain_nucs)) {
2134
			aligned_inter_strain_nucs = align_inter_strain_nucs(replicates, replicates_wp_nucs[[1]], replicates_wp_nucs[[2]], config=config)[[1]]
2135
		}
2136
		if (plot_common_nucs) {
2137
      #Plot common wp nucs
2138
      mid_y = shift = x_min = x_max = nb_rank = base = ylim = ymin = y_max = delta_y = list()
2139
            for (replicate_rank in 1:length(replicates)) {
2140
        nb_rank[[replicate_rank]] = length(samples)
2141
        base[[replicate_rank]] = (replicate_rank-1) * height * nb_rank[[replicate_rank]]
2142
        ylim[[replicate_rank]] = c(base[[replicate_rank]], base[[replicate_rank]] + height * nb_rank[[replicate_rank]])
2143
        y_min[[replicate_rank]] = min(ylim[[replicate_rank]])
2144
        y_max[[replicate_rank]] = max(ylim[[replicate_rank]])
2145
        delta_y[[replicate_rank]] = y_max[[replicate_rank]] - y_min[[replicate_rank]]
2146
        mid_y[[replicate_rank]] = (y_max[[replicate_rank]] + y_min[[replicate_rank]]) / 2
2147
      
2148
        samples = replicates[[replicate_rank]]
2149
        for (sample_rank in 1:length(samples)) {
2150
          sample = samples[[sample_rank]]
2151
          y_lev = y_min[[replicate_rank]] + (sample_rank - 0.5) * delta_y[[replicate_rank]]/nb_rank[[replicate_rank]]
2152
          if (sample$roi[["begin"]] < sample$roi[["end"]]) {
2153
            x_min[[replicate_rank]] = sample$roi[["begin"]]
2154
            x_max[[replicate_rank]] = sample$roi[["end"]]
2155
          } else {
2156
            x_min[[replicate_rank]] = - sample$roi[["begin"]]
2157
            x_max[[replicate_rank]] = - sample$roi[["end"]]
2158
          }
2159
          shift[[replicate_rank]] = x_min[[1]] - x_min[[replicate_rank]]
2160
        }
2161
      }  
2162
      for (inter_strain_nuc_index in 1:length(aligned_inter_strain_nucs[,1])) {
2163
        inter_strain_nuc = aligned_inter_strain_nucs[inter_strain_nuc_index,]
2164
        tmp_xs = tmp_ys = c()
2165
        for (replicate_rank in 1:length(replicates)) {
2166
          samples = replicates[[replicate_rank]]
2167
          strain = samples[[1]]$strain
2168
          tmp_xs = c(tmp_xs, sign(x_min[[replicate_rank]]) * (inter_strain_nuc[[paste("lower_bound_",strain,sep="")]] + inter_strain_nuc[[paste("upper_bound_",strain,sep="")]])/2 + shift[[replicate_rank]])
2169
          tmp_ys = c(tmp_ys, mid_y[[replicate_rank]])
2170
        }
2171
        tmp_ys <<- tmp_ys
2172
        tmp_xs <<- tmp_xs
2173
        lines(tmp_xs, tmp_ys, col=2, type="b", lwd=dev.size()[1]*100/(x_max[[1]]-x_min[[1]]), cex=dev.size()[1]*200/(x_max[[1]]-x_min[[1]]), pch=19)          
2174
      }    
2175
		}
2176
		if (plot_anovas | plot_anova_boxes) {
2177
			anova_results = perform_anovas(replicates, aligned_inter_strain_nucs, plot_anova_boxes=plot_anova_boxes)
2178
			thres = FDR(anova_results[,"correl_st_ma_pvalue"],0.05)
2179
			if (is.na(thres)) {
2180
				# Boneferroni multiple test
2181
				thres = 0.05/length(anova_results[,1])
2182
			}
2183
			filtred_anova_results = anova_results[anova_results[,"correl_st_ma_pvalue"]<thres,]
2184
			x11()
2185
			if (plot_anovas) {
2186
			  plot(anova_results[,"mnase_st"],anova_results[,"correl_st_ma"], pch=1, xlim=c(-0.07,0.07), ylim=c(-0.07,0.07),main="SNEPs" ,xlab = "strain effect", ylab = "snep effect")
2187
	      points(x=filtred_anova_results[,"mnase_st"],y=filtred_anova_results[,"correl_st_ma"],col=2, pch="+")
2188
		  }
2189
 	  }
2190
	}
2191
  return(returned_list)
2192
}
2193

    
2194

    
2195

    
2196