Statistiques
| Branche: | Révision :

root / src / R / nucleominer.R @ 7646593d

Historique | Voir | Annoter | Télécharger (92,22 ko)

1
# get_content = function(# Get content from cached 
2
# ### Acces to the cached content of a file via the global variable NM_CACHE. Load content in NM_CACHE if needed.
3
# obj, ##<< The object that we want its content.
4
# ... ##<< Parameters that will be passed to my_read
5
# ) {
6
#   UseMethod("get_content", obj)
7
#   ### Returns the cached content of the current object. 
8
# }
9
# 
10
# get_content.default = structure( function(# Get content from cached 
11
# ### Acces to the cached content of a file via the global variable NM_CACHE. Load content in NM_CACHE if needed.
12
# obj, ##<< The object that we want its content.
13
# ... ##<< Parameters that will be passed to my_read
14
# ) {
15
#   if (inherits(try(NM_CACHE,TRUE), "try-error") || is.null(NM_CACHE)) {
16
#     NM_CACHE <<- list()
17
#   }
18
#   if (is.null(NM_CACHE[[obj$filename]])) {
19
#     print(paste("Loading file ",obj$filename, sep=""))
20
#     tmp_content = my_read(obj, ...)
21
#     print("affect it...")
22
#     NM_CACHE[[obj$filename]] <<- tmp_content
23
#     print("done.")
24
#   }  
25
#   return(NM_CACHE[[obj$filename]])
26
# ### Returns the cached content of the current object. 
27
# }, ex=function(){
28
#   # Create a dataframe
29
#   df = NULL
30
#   df = dfadd(df, list(key1 = "value1", key2 = "value2"))
31
#   df = dfadd(df, list(key1 = "value1'", key2 = "value3'"))
32
#   # Dump it into tmp file
33
#   write.table(df, file="/tmp/tmp.dump.table.tmp")
34
#   # Load it into cache using feature.
35
#   cached_content_object  = list(filename="/tmp/tmp.dump.table.tmp")
36
#   class(cached_content_object) = "table"  
37
#   # First time it will be load into cache
38
#   print(get_content(cached_content_object))
39
#   # Second time not
40
#   print(get_content(cached_content_object))
41
#   
42
# })
43
#   
44
# my_read = function(
45
# ### Abstract my_read function.  
46
#   obj, ...) {
47
#   UseMethod("my_read", obj)
48
# }
49
#   
50
# my_read.default = function(
51
# ### Default my_read function.  
52
#   obj, ...){
53
#   stop(paste("ERROR, my_read is not defined for any Objects (file ", obj$filename," )", sep=""))      
54
# }
55
# 
56
# my_read.fasta = function(
57
# ### my_read function for fasta files.  
58
#   obj, ...){
59
#   # require(seqinr)
60
#   return(read.fasta(obj$filename, ...))
61
# }
62
# 
63
# my_read.table = function(
64
# ### my_read function for table files.  
65
#   obj, ...){
66
#   if (rev(unlist(strsplit(obj$filename, ".", fixed=TRUE)))[1] == "gz") {
67
#     return(read.table(file=gzfile(obj$filename), ...))      
68
#   } else { 
69
#     return(read.table(file=obj$filename, ...))
70
#   }
71
# }  
72
# 
73
# my_read.cvs = function(
74
# ### my_read function for cvs files.  
75
#   obj, ...){
76
#   return(read.csv(file=obj$filename, ...))
77
# }  
78

    
79

    
80

    
81

    
82

    
83

    
84

    
85

    
86

    
87
FDR = structure(function#  False Discovery Rate
88
### From a vector x of independent p-values, extract the cutoff corresponding to the specified FDR. See Benjamini & Hochberg 1995 paper
89
##author<< Gael Yvert, 
90
(
91
x, ##<< A vector x of independent p-values.
92
FDR ##<< The specified FDR.
93
) {
94
  x <- sort(na.omit(x))
95
  N = length(x)
96
  i = 1;
97
  while(N*x[i]/i < FDR & i <= N) i = i + 1; # we search for the highest i where Nrandom / Nobserved < FDR
98
  if (i == 1)
99
    return (NA)
100
  else
101
    return( x[i-1] )
102
### Return the the corresponding cutoff.
103
}, ex=function(){
104
  print("example")
105
})
106

    
107

    
108

    
109
lod_score_vecs = structure(function # Likelihood ratio
110
### Compute the likelihood log of two set of value from two models Vs. a unique model.
111
(
112
x ,##<< First vector.
113
y ##<< Second vector.
114
) {
115
	if (length(x) <=1 | length(y) <= 1) {
116
		return(NA)
117
	}
118
  meanX = mean(x)
119
  sdX = sd(x)
120
  meanY = mean(y)
121
  sdY = sd(y)
122
  meanXY = mean(c(x,y))
123
  sdXY = sd(c(x,y))
124
  llX = sum(log(dnorm(x,mean=meanX,sd=sdX)))
125
  llY = sum(log(dnorm(y,mean=meanY,sd=sdY)))
126
  llXY = sum(log(dnorm(c(x,y),mean=meanXY,sd=sdXY)))
127
  ratio = llX + llY - llXY
128
  return(ratio)
129
### Returns the likelihood ratio.
130
}, ex=function(){
131
  # LOD score for 2 set of values
132
  mean1=5; sd1=2; card2 = 250
133
  mean2=6; sd2=3; card1 = 200
134
  x1 = rnorm(card1, mean1, sd1)
135
  x2 = rnorm(card2, mean2, sd2)  
136
  min = floor(min(c(x1,x2)))
137
  max = ceiling(max(c(x1,x2)))
138
  hist(c(x1,x2), xlim=c(min, max), breaks=min:max)
139
  lines(min:max,dnorm(min:max,mean1,sd1)*card1,col=2)
140
  lines(min:max,dnorm(min:max,mean2,sd2)*card2,col=3)
141
  lines(min:max,dnorm(min:max,mean(c(x1,x2)),sd(c(x1,x2)))*card2,col=4)
142
  lod_score_vecs(x1,x2)
143
 })
144

    
145
dfadd = structure(function# Adding list to a dataframe.
146
### Add a list \emph{l} to a dataframe \emph{df}. Create it if \emph{df} is \emph{NULL}. Return the dataframe \emph{df}.
147
	(df, ##<<  A dataframe
148
		l ##<<  A list
149
	) {
150
  if (is.null(df)) {
151
    df = data.frame(l,stringsAsFactors=FALSE)
152
  } else {
153
    df = rbind(df, data.frame(l,stringsAsFactors=FALSE))
154
  }
155
  return(df)
156
### Return the dataframe \emph{df}.
157
}, ex=function(){
158
		## Here dataframe is NULL
159
		print(df)
160
		df = NULL
161
		
162
		# Initialize df
163
		df = dfadd(df, list(key1 = "value1", key2 = "value2"))
164
		print(df)
165
		
166
		# Adding elements to df
167
		df = dfadd(df, list(key1 = "value1'", key2 = "value2'"))
168
		print(df)
169
})
170

    
171

    
172
sign_from_strand = function(
173
### Get the sign of strand 
174
strands) {
175
	apply(t(strands), 2, function(strand) {	if (strand == "F") return(1) else return(-1)})
176
### If strand in forward then returns 1 else returns -1
177
}
178

    
179
flat_reads = function(
180
### Extract reads coordinates from TempleteFilter input sequence
181
reads, ##<< TemplateFilter input reads 
182
nuc_width ##<< Width used to shift F and R reads.
183
) {
184
	F_flatted_reads = unlist(apply(t(reads[reads$V3=="F",]),2,function(r){rep(as.integer(r[2]), r[4])}))
185
	R_flatted_reads = unlist(apply(t(reads[reads$V3=="R",]),2,function(r){rep(as.integer(r[2]), r[4])}))
186
	flatted_reads = c(F_flatted_reads + rep(nuc_width/2, length(F_flatted_reads)), R_flatted_reads - rep(nuc_width/2, length(R_flatted_reads))  )
187
	return(list(F_flatted_reads, R_flatted_reads, flatted_reads))
188
### Returns a list of F reads, R reads and joint/shifted F and R reads.
189
}
190

    
191

    
192

    
193
filter_tf_outputs = function(# Filter TemplateFilter outputs
194
### This function filters TemplateFilter outputs according, not only genome area observerved properties, but also correlation and overlap threshold.
195
tf_outputs, ##<< TemplateFilter outputs.
196
chr, ##<< Chromosome observed, here chr is an integer.
197
x_min, ##<< Coordinate of the first bp observed.
198
x_max, ##<< Coordinate of the last bp observed.
199
nuc_width = 160, ##<< Nucleosome width.
200
ol_bp = 59, ##<< Overlap Threshold.
201
corr_thres = 0.5 ##<< Correlation threshold.
202
) {
203
  if (x_min < 0) {
204
    tf_outputs = tf_outputs[tf_outputs$chr == paste("chr", chr, sep="") & tf_outputs$center > (-x_max - nuc_width/2) & tf_outputs$center <  (-x_min + nuc_width/2),]
205
	} else {
206
    tf_outputs = tf_outputs[tf_outputs$chr == paste("chr", chr, sep="") & tf_outputs$center > (x_min - nuc_width/2) & tf_outputs$center < (x_max + nuc_width/2),]
207
  }
208
  tf_outputs$lower_bound = tf_outputs$center - tf_outputs$width/2
209
  tf_outputs$upper_bound = tf_outputs$center + tf_outputs$width/2
210
  tf_outputs = tf_outputs[tf_outputs$correlation >= corr_thres,]  
211
  tf_outputs = tf_outputs[order(tf_outputs$correlation,decreasing=TRUE),]  
212
  i = 1
213
  while (i <= length(tf_outputs[,1])) {    
214
    lb = tf_outputs[i,]$low 
215
    ub = tf_outputs[i,]$up
216
    tf_outputs = tf_outputs[!(tf_outputs$low <= (ub-ol_bp) & tf_outputs$up > ub) & !(tf_outputs$up >= (lb+ol_bp) & tf_outputs$low < lb),]
217
    i = i+1
218
  }
219
  return(tf_outputs)
220
### Returns filtered TemplateFilter Outputs
221
}
222

    
223

    
224

    
225
filter_tf_inputs = function(# Filter TemplateFilter inputs
226
### This function filters TemplateFilter inputs according genome area observed properties. It takes into account reads that are at the frontier of this area and the strand of these reads.
227
inputs, ##<< TF inputs to be filtered.
228
chr, ##<< Chromosome observed, here chr is an integer.
229
x_min, ##<< Coordinate of the first bp observed.
230
x_max, ##<< Coordinate of the last bp observed.
231
nuc_width = 160, ##<< Nucleosome width.
232
only_f = FALSE, ##<< Filter only F reads.
233
only_r = FALSE, ##<< Filter only R reads.
234
filter_for_coverage = FALSE ##<< Does it filter for plot coverage?
235
) {
236
	if (only_f) {
237
		inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "F" & inputs[,2] <= x_max + nuc_width,]
238
	} else if (only_r) {
239
		inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "R" & inputs[,2] <= x_max + nuc_width,]			
240
	} else {
241
		inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,2] <= x_max + nuc_width,]
242
	}
243
  if (!filter_for_coverage) {
244
    corrected_inputs_coords = inputs[,2] + nuc_width/2 * sign_from_strand(inputs[,3])
245
    inputs = inputs[inputs[,1]==chr & corrected_inputs_coords >= x_min & corrected_inputs_coords <= x_max,]    
246
  }
247
	return(inputs)
248
### Returns filtred inputs.
249
}
250

    
251

    
252

    
253
get_comp_strand = function(
254
### Compute the complementatry strand.
255
strand ##<< The original strand.
256
) {
257
	apply(t(strand),2, function(n){
258
	  if (n=="a") {return("t")} 
259
		if (n=="t") {return("a")}
260
		if (n=="c") {return("g")} 
261
		if (n=="g") {return("c")} 
262
	})
263
### Returns the complementatry strand.
264
}
265

    
266

    
267
aggregate_intra_strain_nucs = structure(function(# Aggregate replicated sample's nucleosomes. 
268
### This function aggregates nucleosome for replicated samples. It uses TemplateFilter ouput of each sample as replicate. Each sample owns a set of nucleosomes computed using TemplateFilter and ordered by the position of their center. Adajacent nucleosomes are compared two by two. Comparison is based on a log likelihood ratio score. The issue of comparison is adjacents nucleosomes merge or separation. Finally the function returns a list of clusters and all computed \emph{lod_scores}. Each cluster ows an attribute \emph{wp} for "well positionned". This attribute is set as \emph{TRUE} if the cluster is composed of exactly one nucleosomes of each sample.
269
samples, ##<< A list of samples. Each sample is a list like \emph{sample = list(id=..., marker=..., strain=..., roi=..., inputs=..., outputs=...)} with \emph{roi = list(name=..., begin=...,  end=..., chr=..., genome=...)}.
270
lod_thres=20, ##<< Log likelihood ration threshold.
271
coord_max=20000000 ##<< A too big value to be a coord for a nucleosome lower bound.
272
){
273
	end_of_tracks = function(tracks) {
274
		if (length(tracks) == 0) {
275
			return(TRUE)
276
		}
277
	  for (lower_bound in tracks) {
278
			if(!is.na(lower_bound)) {
279
	      if (lower_bound < coord_max) {
280
	        return(FALSE)
281
	      }
282
	  	}
283
	  }
284
	  return(TRUE)
285
	}
286
	store_cluster = function(clusters, new_cluster, nb_nucs_in_cluster,nuc_from_track,nb_tracks, min_nuc_center, max_nuc_center) {
287
		if ( nb_nucs_in_cluster==nb_tracks & sum(nuc_from_track)==nb_tracks) {
288
			new_cluster$wp = TRUE
289
			center = (new_cluster$lower_bound + new_cluster$upper_bound) / 2
290
			if (is.null(min_nuc_center) | ((min_nuc_center <= center) & (center < max_nuc_center))) {
291
		  	clusters[[length(clusters) + 1]] = new_cluster
292
				# print(new_cluster)
293
		  }
294
		} else {
295
			new_cluster$wp = FALSE
296
			center = (new_cluster$lower_bound + new_cluster$upper_bound) / 2
297
			if (is.null(min_nuc_center) | ((min_nuc_center <= center) & (center < max_nuc_center))) {
298
			  clusters[[length(clusters) + 1]] = new_cluster
299
			}
300
		}
301
		return(clusters)
302
	}
303
	strain = samples[[1]]$strain
304
	lod_scores = c()
305
  min_nuc_center = min(samples[[1]]$roi$begin, samples[[1]]$roi$end)
306
	max_nuc_center = max(samples[[1]]$roi$begin, samples[[1]]$roi$end)
307
  # compute clusters
308
  clusters = list()
309
  cluster_contents = list()
310
  # Init reader
311
  indexes = c()
312
  track_readers = c()
313
  current_nuc = NULL
314
	lod_score = lod_thres + 1 
315
  # Read nucs from TF outputs  
316
  tf_outs = list()
317
	i = 1
318
  for (sample in samples) {
319
		# print(sample$roi$chr)
320
		# print(min_nuc_center)
321
		# print(max_nuc_center)
322
		# print(sample$outputs)
323
		# tf_outs[[i]] = filter_tf_outputs(sample$outputs, sample$roi$chr, min_nuc_center, max_nuc_center)
324
		# print(tf_outs[[i]])
325
		tf_outs[[i]] = sample$outputs
326
		tf_outs[[i]] = tf_outs[[i]][order(tf_outs[[i]]$center),]  
327
    indexes[i] = 1
328
		if (is.na(tf_outs[[i]][indexes[i],]$center)) {
329
      track_readers[i] = coord_max				
330
	  } else {
331
      track_readers[i] = tf_outs[[i]][indexes[i],]$center				
332
		}
333
		i = i+1		
334
  }
335
	# print(track_readers)
336
  new_cluster = NULL
337
  nb_nucs_in_cluster = 0
338
  nuc_from_track = c()
339
  for (i in 1:length(tf_outs)){
340
    nuc_from_track[i] = FALSE
341
  }
342
  # Start clustering
343
  while (!end_of_tracks(track_readers)) {
344
    new_center = min(track_readers)
345
		current_track = which(track_readers == new_center)[1]			
346
    new_nuc = as.list(tf_outs[[current_track]][indexes[current_track],])
347
		new_nuc$chr = substr(new_nuc$chr,4,1000000L)
348
		new_nuc$inputs = samples[[current_track]]$inputs
349
		new_nuc$chr = samples[[current_track]]$roi$chr
350
		new_nuc$track = current_track
351
		
352
		new_nuc$inputs = filter_tf_inputs(samples[[current_track]]$inputs, new_nuc$chr, new_nuc$lower_bound, new_nuc$upper_bound, new_nuc$width) 
353
		flatted_reads = flat_reads(new_nuc$inputs, new_nuc$width)
354
		new_nuc$original_reads = flatted_reads[[3]]
355

    
356
    new_upper_bound = new_nuc$upper_bound
357

    
358
    if (!is.null(current_nuc)) {
359
			lod_score = lod_score_vecs(current_nuc$original_reads,new_nuc$original_reads)
360
			lod_scores = c(lod_scores,lod_score)
361
		}
362
		# print(paste(lod_score, length(current_nuc$original_reads), length(new_nuc$original_reads), sep=" "))
363
		if (is.na(lod_score)) {
364
			lod_score = lod_thres + 1 
365
		}
366
		# Store lod_score
367
		new_nuc$lod_score = lod_score 			
368
	  if (lod_score < lod_thres) {
369
      # aggregate to current cluster
370
      #   update bound
371
      if (new_nuc$upper_bound > new_cluster$upper_bound) {
372
        new_cluster$upper_bound = new_nuc$upper_bound      
373
      }
374
      if (new_nuc$lower_bound < new_cluster$lower_bound) {
375
        new_cluster$lower_bound = new_nuc$lower_bound         
376
      }
377
      #   add nucleosome to current cluster
378
      nuc_from_track[current_track] = TRUE
379
      nb_nucs_in_cluster = nb_nucs_in_cluster + 1 
380
			new_cluster$nucs[[length(new_cluster$nucs)+1]] = new_nuc
381
    } else {
382
			if (!is.null(new_cluster)) {
383
        # store old cluster
384
	      clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster,nuc_from_track,length(tf_outs),min_nuc_center, max_nuc_center)
385
			}
386
      # Reinit current cluster composition stuff
387
      nb_nucs_in_cluster = 0
388
      nuc_from_track = c()
389
      for (i in 1:length(tf_outs)){
390
        nuc_from_track[i] = FALSE
391
      }
392
      # create new cluster        
393
      new_cluster = list(lower_bound=new_nuc$low, upper_bound=new_nuc$up, chr=new_nuc$chr, strain_ref=strain , nucs=list())
394
      # update upper bound
395
      current_upper_bound = new_upper_bound
396
      # add nucleosome to current cluster
397
      nb_nucs_in_cluster = nb_nucs_in_cluster + 1 
398
      nuc_from_track[current_track] = TRUE
399
			new_cluster$nucs[[length(new_cluster$nucs)+1]] = new_nuc
400
				
401
		}
402
			
403
		current_nuc = new_nuc
404

    
405
    # update indexes
406
    if (indexes[current_track] < length(tf_outs[[current_track]]$center)) {      
407
      indexes[current_track] = indexes[current_track] + 1
408
      # update track
409
      track_readers[current_track] = tf_outs[[current_track]][indexes[current_track],]$center
410
    } else {
411
      # update track
412
      track_readers[current_track] = coord_max
413
    }
414
  }
415
  # store last cluster
416
  if (!is.null(new_cluster)) {
417
    # store old cluster
418
    clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster,nuc_from_track,length(tf_outs),min_nuc_center, max_nuc_center)
419
  }
420
	return(list(clusters, lod_scores))
421
### Returns a list of clusterized nucleosomes, and all computed lod scores.
422
}, ex=function(){	
423
	# Dealing with a region of interest
424
	roi =list(name="example", begin=1000,  end=1300, chr="1", genome=rep("A",301))
425
	samples = list()
426
	for (i in 1:3) {
427
		# Create TF output
428
		tf_nuc = list("chr"=paste("chr", roi$chr, sep=""), "center"=(roi$end + roi$begin)/2, "width"= 150, "correlation.score"= 0.9)
429
		outputs = dfadd(NULL,tf_nuc)
430
		outputs = filter_tf_outputs(outputs, roi$chr, roi$begin, roi$end)
431
		# Generate corresponding reads
432
		nb_reads = round(runif(1,170,230))
433
		reads = round(rnorm(nb_reads, tf_nuc$center,20))
434
		u_reads = sort(unique(reads))
435
		strands = sample(c(rep("R",ceiling(length(u_reads)/2)),rep("F",floor(length(u_reads)/2))))
436
		counts = apply(t(u_reads), 2, function(r) { sum(reads == r)})
437
		shifts = apply(t(strands), 2, function(s) { if (s == "F") return(-tf_nuc$width/2) else return(tf_nuc$width/2)})
438
		u_reads = u_reads + shifts
439
		inputs = data.frame(list("V1" = rep(roi$chr, length(u_reads)), 
440
		                         "V2" = u_reads, 
441
														 "V3" = strands, 
442
														 "V4" = counts), stringsAsFactors=FALSE)
443
		samples[[length(samples) + 1]] = list(id=1, marker="Mnase_Seq", strain="strain_ex", total_reads = 10000000, roi=roi, inputs=inputs, outputs=outputs)
444
	}
445
	print(aggregate_intra_strain_nucs(samples))
446
})
447

    
448
flat_aggregated_intra_strain_nucs = function(# to flat aggregate_intra_strain_nucs function output
449
### This function builds a dataframe of all clusters obtain from aggregate_intra_strain_nucs function.
450
partial_strain_maps, ##<< the output of aggregate_intra_strain_nucs function
451
roi_index ##<< the index of the roi involved
452
) {
453
	if  (length(partial_strain_maps) == 0 ){
454
		print(paste("Empty partial_strain_maps for roi", roi_index, "ands strain", strain, "." ))
455
    tmp_strain_maps = list()  
456
	} else {
457
		tmp_strain_map = apply(t(1:length(partial_strain_maps)), 2, function(i){
458
			tmp_nuc = partial_strain_maps[[i]]
459
			tmp_nuc_as_list = list()
460
			tmp_nuc_as_list[["chr"]] = tmp_nuc[["chr"]]
461
			tmp_nuc_as_list[["lower_bound"]] = ceiling(tmp_nuc[["lower_bound"]])
462
			tmp_nuc_as_list[["upper_bound"]] = floor(tmp_nuc[["upper_bound"]])
463
			tmp_nuc_as_list[["roi_index"]] = roi_index
464
			tmp_nuc_as_list[["index_nuc"]] = i
465
			tmp_nuc_as_list[["wp"]] = as.integer(tmp_nuc$wp)
466
			all_original_reads = c()
467
			for (j in 1:length(tmp_nuc$nucs)) {
468
				all_original_reads = c(all_original_reads, tmp_nuc$nucs[[j]]$original_reads)
469
			}
470
			tmp_nuc_as_list[["nb_reads"]] = length(all_original_reads)
471
			if (tmp_nuc$wp) {
472
				tmp_nuc_as_list[["lod_1"]] = signif(tmp_nuc$nucs[[2]]$lod_score,5)
473
				tmp_nuc_as_list[["lod_2"]] = signif(tmp_nuc$nucs[[3]]$lod_score,5)
474
			} else {
475
				tmp_nuc_as_list[["lod_1"]] = NA
476
				tmp_nuc_as_list[["lod_2"]] = NA							
477
			}
478
      return(tmp_nuc_as_list)
479
    })
480
    tmp_strain_maps = do.call("rbind", tmp_strain_map)
481
	}
482
  return(data.frame(tmp_strain_maps))
483
### Returns a dataframe of all clusters obtain from aggregate_intra_strain_nucs function.
484
}
485

    
486
align_inter_strain_nucs = structure(function(# Aligns nucleosomes between 2 strains.
487
### This function aligns nucs between two strains for a given genome region.
488
replicates, ##<< Set of replicates, ideally 3 per strain. 
489
wp_nucs_strain_ref1=NULL, ##<< List of aggregates nucleosome for strain 1. If it's null this list will be computed.
490
wp_nucs_strain_ref2=NULL, ##<< List of aggregates nucleosome for strain 2. If it's null this list will be computed.
491
corr_thres=0.5, ##<< Correlation threshold.
492
lod_thres=100, ##<< LOD cut off.
493
config=NULL, ##<< GLOBAL config variable
494
... ##<< A list of parameters that will be passed to \emph{aggregate_intra_strain_nucs} if needed.	
495
) {
496
	if (length(replicates) < 2) {
497
		stop("ERROR, align_inter_strain_nucs needs 2 replicate sets.")
498
	} else if (length(replicates) > 2) {
499
		print("WARNING, align_inter_strain_nucs will use 2 first sets of replicates as inputs.")			
500
	}
501
	common_nuc = NULL
502
	lod_scores = c()
503
	chr = replicates[[1]][[1]]$roi$chr
504
  min_nuc_center = min(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end)
505
	max_nuc_center = max(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end)
506
	strain_ref1 = replicates[[1]][[1]]$strain
507
	strain_ref2 = replicates[[2]][[1]]$strain
508
	big_roi = replicates[[1]][[1]]$roi
509
  orig_big_roi = replicates[[1]][[1]]$orig_roi
510
	if(big_roi$end - big_roi$begin < 0) {
511
		tmp_begin = big_roi$begin
512
		big_roi$begin =  big_roi$end
513
		big_roi$end =  tmp_begin
514
	}
515
	# GO!
516
	if (is.null(wp_nucs_strain_ref1)) {
517
		wp_nucs_strain_ref1 = aggregate_intra_strain_nucs(replicates[[1]], ...)[[1]]
518
	}
519
	if (is.null(wp_nucs_strain_ref2)) {
520
	  wp_nucs_strain_ref2 = aggregate_intra_strain_nucs(replicates[[2]], ...)[[1]]
521
  }
522
  # foo <<- wp_nucs_strain_ref1
523
  # print(apply(t(wp_nucs_strain_ref1), 2, function(l){c(l[[1]]$lower_bound, l[[1]]$upper_bound, l[[1]]$wp)}))
524
  # print(apply(t(wp_nucs_strain_ref2), 2, function(l){c(l[[1]]$lower_bound, l[[1]]$upper_bound, l[[1]]$wp)}))
525
	# dealing with matching_nas
526
	lws = c()
527
	ups = c()
528
	for (na in wp_nucs_strain_ref2) {
529
		lws = c(lws, na$lower_bound)
530
		ups = c(ups, na$upper_bound)
531
	}
532

    
533
	print(paste("Exploring chr" , chr , ", " , length(wp_nucs_strain_ref1) , ", [" , min_nuc_center , ", " , max_nuc_center , "] nucs...", sep=""))			
534
	roi_strain_ref1 = NULL
535
	roi_strain_ref2 = NULL
536
	if (length(wp_nucs_strain_ref1) > 0) {
537
		for(index_nuc_strain_ref1 in 1:length(wp_nucs_strain_ref1)){
538
			# print(paste("" , index_nuc_strain_ref1 , "/" , length(wp_nucs_strain_ref1), sep=""))
539
			nuc_strain_ref1 = wp_nucs_strain_ref1[[index_nuc_strain_ref1]]
540
			# Filtering on Well Positionned
541
			if (nuc_strain_ref1$wp) {
542
				roi_strain_ref1 = list(name=paste("strain_chr_id_" , strain_ref1 , "_" , chr , "_" , "i" , "_", sep=""), begin=nuc_strain_ref1$lower_bound, end=nuc_strain_ref1$upper_bound, chr=chr, strain_ref = strain_ref1)
543
				roi_strain_ref2 = translate_roi(roi_strain_ref1, strain_ref2, big_roi=orig_big_roi, config=config) 
544
        if (!is.null(roi_strain_ref2)){
545
					# LOADING INTRA_STRAIN_NUCS_FILENAME_STRAIN_REF2 FILE(S) TO COMPUTE MATCHING_NAS (FILTER)
546
					lower_bound_roi_strain_ref2 = min(roi_strain_ref2$end,roi_strain_ref2$begin)
547
					upper_bound_roi_strain_ref2 = max(roi_strain_ref2$end,roi_strain_ref2$begin)
548
					matching_nas = which( lower_bound_roi_strain_ref2 <= ups & lws <= upper_bound_roi_strain_ref2)
549
					for (index_nuc_strain_ref2 in matching_nas) {
550
						nuc_strain_ref2 = wp_nucs_strain_ref2[[index_nuc_strain_ref2]]
551
						# Filtering on Well Positionned
552
    				nuc_strain_ref2_to_roi = list(begin=nuc_strain_ref2$lower_bound, end=nuc_strain_ref2$upper_bound, chr=nuc_strain_ref2$chr, strain_ref = strain_ref2)             
553
						if (!is.null(translate_roi(nuc_strain_ref2_to_roi, strain_ref1, big_roi=orig_big_roi, config=config)) &  
554
                nuc_strain_ref2$wp) {
555
							# Filtering on correlation Score and collecting reads					
556
							SKIP = FALSE
557
							# TODO: This for loop could be done before working on strain_ref2. Isn't it?
558
							reads_strain_ref1 = c()
559
							for (nuc in nuc_strain_ref1$nucs){
560
								reads_strain_ref1 = c(reads_strain_ref1, nuc$original_reads)							
561
								if (nuc$corr < corr_thres) {
562
									SKIP = TRUE
563
								}
564
							}
565
							reads_strain_ref2 = c()
566
							for (nuc in nuc_strain_ref2$nucs){
567
								reads_strain_ref2 = c(reads_strain_ref2, nuc$original_reads)							
568
								if (nuc$corr < corr_thres) {
569
									SKIP = TRUE
570
								}
571
							}
572
							# Filtering on correlation Score						
573
							if (!SKIP) {
574
								# tranlation of reads into strain 2 coords
575
								diff = ((roi_strain_ref1$begin + roi_strain_ref1$end) - (roi_strain_ref2$begin + roi_strain_ref2$end)) / 2
576
								reads_strain_ref1 = reads_strain_ref1 - rep(diff, length(reads_strain_ref1))
577
								lod_score = lod_score_vecs(reads_strain_ref1, reads_strain_ref2)
578
								lod_scores = c(lod_scores, lod_score)
579
								# Filtering on LOD Score						
580
								if (lod_score < lod_thres) {
581
									tmp_nuc = list()
582
									# strain_ref1
583
									tmp_nuc[[paste("chr_", strain_ref1, sep="")]] = chr
584
									tmp_nuc[[paste("lower_bound_", strain_ref1, sep="")]] = nuc_strain_ref1$lower_bound
585
									tmp_nuc[[paste("upper_bound_", strain_ref1, sep="")]] = nuc_strain_ref1$upper_bound
586
									tmp_nuc[[paste("mean_", strain_ref1, sep="")]] = signif(mean(reads_strain_ref1),5)
587
									tmp_nuc[[paste("sd_", strain_ref1, sep="")]] = signif(sd(reads_strain_ref1),5)
588
									tmp_nuc[[paste("nb_reads_", strain_ref1, sep="")]] = length(reads_strain_ref1)
589
									tmp_nuc[[paste("index_nuc_", strain_ref1, sep="")]] = index_nuc_strain_ref1
590
									# tmp_nuc[[paste("corr1_", strain_ref1, sep="")]] = signif(nuc_strain_ref1$nucs[[1]]$corr,5)
591
									# tmp_nuc[[paste("corr2_", strain_ref1, sep="")]] = signif(nuc_strain_ref1$nucs[[2]]$corr,5)
592
									# tmp_nuc[[paste("corr3_", strain_ref1, sep="")]] = signif(nuc_strain_ref1$nucs[[3]]$corr,5)
593
									# strain_ref2
594
									tmp_nuc[[paste("chr_", strain_ref2, sep="")]] = roi_strain_ref2$chr
595
									tmp_nuc[[paste("lower_bound_", strain_ref2, sep="")]] = nuc_strain_ref2$lower_bound
596
									tmp_nuc[[paste("upper_bound_", strain_ref2, sep="")]] = nuc_strain_ref2$upper_bound
597
									tmp_nuc[[paste("means_", strain_ref2, sep="")]] = signif(mean(reads_strain_ref2),5)
598
									tmp_nuc[[paste("sd_", strain_ref2, sep="")]] = signif(sd(reads_strain_ref2),5)
599
									tmp_nuc[[paste("nb_reads_", strain_ref2, sep="")]] = length(reads_strain_ref2)
600
									tmp_nuc[[paste("index_nuc_", strain_ref2, sep="")]] = index_nuc_strain_ref2
601
									# tmp_nuc[[paste("corr1_", strain_ref2, sep="")]] = signif(nuc_strain_ref2$nucs[[1]]$corr,5)
602
									# tmp_nuc[[paste("corr2_", strain_ref2, sep="")]] = signif(nuc_strain_ref2$nucs[[2]]$corr,5)
603
									# tmp_nuc[[paste("corr3_", strain_ref2, sep="")]] = signif(nuc_strain_ref2$nucs[[3]]$corr,5)
604
									# common
605
									tmp_nuc[["lod_score"]] = signif(lod_score,5)
606
									# print(tmp_nuc)
607
									common_nuc = dfadd(common_nuc, tmp_nuc)
608
								}
609
							}
610
						}
611
					}				
612
				} else {
613
		      print("WARNING! No roi for strain ref 2.")
614
			  }
615
		  }
616
		}
617
		
618
		if(length(unique(common_nuc[,1:3])[,1]) != length((common_nuc[,1:3])[,1])) {
619
			index_redundant = which(apply(common_nuc[,1:3][-length(common_nuc[,1]),] ==  common_nuc[,1:3][-1,] ,1,sum) == 3)
620
			to_remove_list = c()			
621
			for (i in 1:length(index_redundant)) {	
622
				if (common_nuc[index_redundant[i],15] < common_nuc[index_redundant[i]+1,15]) {
623
				  to_remove = index_redundant[i]
624
				}	 else {
625
					to_remove = index_redundant[i] + 1
626
			  } 		
627
				to_remove_list = c(to_remove_list, to_remove)
628
			}
629
			common_nuc = common_nuc[-to_remove_list,]
630
		}
631
		
632
		if(length(unique(common_nuc[,8:10])[,1]) != length((common_nuc[,8:10])[,1])) {
633
			index_redundant = which(apply(common_nuc[,8:10][-length(common_nuc[,1]),] == common_nuc[,8:10][-1,] ,1,sum) == 3)
634
			to_remove_list = c()			
635
			for (i in 1:length(index_redundant)) {	
636
				if (common_nuc[index_redundant[i],15] < common_nuc[index_redundant[i]+1,15]) {
637
				  to_remove = index_redundant[i]
638
				}	 else {
639
					to_remove = index_redundant[i] + 1
640
			  } 		
641
				to_remove_list = c(to_remove_list, to_remove)
642
			}
643
			common_nuc = common_nuc[-to_remove_list,]
644
		}
645
				
646
		return(list(common_nuc, lod_scores))
647
	} else {
648
		print("WARNING, no nucs for strain_ref1.")
649
		return(NULL)
650
	}
651
### Returns a list of clusterized nucleosomes, and all computed lod scores.
652
}, ex=function(){	
653

    
654
    # Define new translate_roi function...
655
    translate_roi = function(roi, strain2, big_roi=NULL, config=NULL) {
656
      return(roi)
657
    }
658
    # Binding it by uncomment follwing lines.
659
    unlockBinding("translate_roi", as.environment("package:nucleominer"))
660
    unlockBinding("translate_roi", getNamespace("nucleominer"))
661
    assign("translate_roi", translate_roi, "package:nucleominer")
662
    assign("translate_roi", translate_roi, getNamespace("nucleominer"))
663
    lockBinding("translate_roi", getNamespace("nucleominer"))
664
    lockBinding("translate_roi", as.environment("package:nucleominer"))  
665

    
666
	# Dealing with a region of interest
667
	roi =list(name="example", begin=1000,  end=1300, chr="1", genome=rep("A",301), strain_ref1 = "STRAINREF1")
668
	roi2 = translate_roi(roi, roi$strain_ref1)
669
	replicates = list()
670
	for (j in 1:2) {
671
		samples = list()
672
		for (i in 1:3) {
673
			# Create TF output
674
			tf_nuc = list("chr"=paste("chr", roi$chr, sep=""), "center"=(roi$end + roi$begin)/2, "width"= 150, "correlation.score"= 0.9)
675
			outputs = dfadd(NULL,tf_nuc)
676
			outputs = filter_tf_outputs(outputs, roi$chr, roi$begin, roi$end)
677
			# Generate corresponding reads
678
			nb_reads = round(runif(1,170,230))
679
			reads = round(rnorm(nb_reads, tf_nuc$center,20))
680
			u_reads = sort(unique(reads))
681
			strands = sample(c(rep("R",ceiling(length(u_reads)/2)),rep("F",floor(length(u_reads)/2))))
682
			counts = apply(t(u_reads), 2, function(r) { sum(reads == r)})
683
			shifts = apply(t(strands), 2, function(s) { if (s == "F") return(-tf_nuc$width/2) else return(tf_nuc$width/2)})
684
			u_reads = u_reads + shifts
685
			inputs = data.frame(list("V1" = rep(roi$chr, length(u_reads)), 
686
			                         "V2" = u_reads, 
687
															 "V3" = strands, 
688
															 "V4" = counts), stringsAsFactors=FALSE)
689
			samples[[length(samples) + 1]] = list(id=1, marker="Mnase_Seq", strain=paste("strain_ex",j,sep=""), total_reads = 10000000, roi=roi, inputs=inputs, outputs=outputs)
690
		}
691
		replicates[[length(replicates) + 1]] = samples
692
	}
693
	print(align_inter_strain_nucs(replicates))
694
})
695

    
696

    
697

    
698

    
699

    
700

    
701

    
702

    
703

    
704

    
705

    
706

    
707

    
708

    
709

    
710

    
711

    
712

    
713

    
714

    
715

    
716

    
717

    
718

    
719

    
720

    
721
fetch_mnase_replicates = function(# Prefetch data
722
### Fetch and filter inputs and outpouts per region of interest. Organize it per replicates.
723
strain, ##<< The strain we want mnase replicatesList of replicates. Each replicates is a vector of sample ids.
724
roi, ##<< Region of interest.
725
all_samples, ##<< Global list of samples.
726
config=NULL, ##<< GLOBAL config variable
727
only_fetch=FALSE, ##<< If TRUE, only fetch and not filtering. It is used tio load sample files into memory before forking.
728
get_genome=FALSE, ##<< If TRUE, load corresponding genome sequence.
729
get_ouputs=TRUE##<< If TRUE, get also ouput corresponding TF output files.
730
) {
731
	samples=list()
732
  samples_ids = unique(all_samples[all_samples$marker == "Mnase_Seq" & all_samples$strain == strain,]$id)
733
	for (i in samples_ids) {
734
		sample = as.list(all_samples[all_samples$id==i,])
735
    sample$orig_roi = roi
736
    sample$roi = translate_roi(roi, sample$strain, config = config)
737
		if (get_genome) {
738
			# Get Genome
739
      sample$roi$genome = get_content(config$FASTA_REFERENCE_GENOME_FILES[[sample$strain]], "fasta")[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end]		
740
		}
741
		# Get inputs
742
		sample$inputs = get_content(paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep=""), "table", stringsAsFactors=FALSE) 
743
		sample$total_reads = sum(sample$inputs[,4]) 	
744
		if (!only_fetch) {
745
		  sample$inputs = filter_tf_inputs(sample$inputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
746
	  }
747
	  # Get TF outputs for Mnase_Seq samples
748
		if (sample$marker == "Mnase_Seq" & get_ouputs) {	
749
			sample$outputs = get_content(paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep=""), "table", header=TRUE, sep="\t")
750
			if (!only_fetch) {
751
	  		sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr,  min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
752
  		}
753
		}
754
		samples[[length(samples) + 1]] = sample		
755
	}
756
  return(samples)
757
}
758

    
759
substract_region = function(# Substract to a list of regions an other list of regions that intersect it.
760
### This fucntion embed a recursive part. It occurs when a substracted region split an original region on two.
761
region1, ##<< Original regions. 
762
region2 ##<< Regions to substract.
763
) {
764
  rec_substract_region = function(region1, region2) {
765
  non_inter_fuzzy = apply(t(1:length(region1[,1])), 2, function(i) {
766
    cur_fuzzy = region1[i,]
767
    inter_wp = region2[region2$lower_bound <= cur_fuzzy$upper_bound & region2$upper_bound >= cur_fuzzy$lower_bound,]
768
    if (length(inter_wp[,1]) > 0) {
769
      ret = c()
770
      for (j in 1:length(inter_wp[,1])) {
771
        cur_wp = inter_wp[j,]
772
        if (cur_wp$lower_bound <= cur_fuzzy$lower_bound & cur_fuzzy$upper_bound <= cur_wp$upper_bound) {
773
          # remove cur_fuzzy
774
          ret = c()
775
          break
776
        } else if (cur_wp$lower_bound <= cur_fuzzy$lower_bound & cur_wp$upper_bound < cur_fuzzy$upper_bound) {
777
          # crop fuzzy 
778
          cur_fuzzy$lower_bound = cur_wp$upper_bound + 1 
779
          ret = cur_fuzzy
780
        } else if (cur_fuzzy$lower_bound < cur_wp$lower_bound & cur_fuzzy$upper_bound <= cur_wp$upper_bound) {
781
          # crop fuzzy 
782
          cur_fuzzy$upper_bound = cur_wp$lower_bound - 1 
783
          ret = cur_fuzzy
784
        } else if (cur_wp$lower_bound > cur_fuzzy$lower_bound & cur_wp$upper_bound < cur_fuzzy$upper_bound) {
785
          # split fuzzy
786
          tmp_ret_fuzzy_1 = cur_fuzzy
787
          tmp_ret_fuzzy_1$upper_bound = cur_wp$lower_bound - 1 
788
          tmp_ret_fuzzy_2 = cur_fuzzy
789
          tmp_ret_fuzzy_2$lower_bound = cur_wp$upper_bound + 1 
790
          ret = rec_substract_region(rbind(tmp_ret_fuzzy_1, tmp_ret_fuzzy_2), inter_wp)
791
          # print(ret)
792
          # ret = cur_fuzzy
793
          break
794
        } else {
795
          stop("WARNING NO ADAPTED CASE!")
796
        }          
797
      }
798
      return(ret)
799
    } else {
800
      return(cur_fuzzy)
801
    }
802
  })
803
  }
804
  non_inter_fuzzy = rec_substract_region(region1, region2)
805
  if (is.null(non_inter_fuzzy)) {return(non_inter_fuzzy)}
806
  tmp_ulist = unlist(non_inter_fuzzy)
807
  tmp_names = names(tmp_ulist)[1:4]
808
  non_inter_fuzzy = data.frame(matrix(tmp_ulist, ncol=4, byrow=TRUE), stringsAsFactors=FALSE) 
809
  names(non_inter_fuzzy) = tmp_names
810
  non_inter_fuzzy$chr = as.character(non_inter_fuzzy$chr)     
811
  non_inter_fuzzy$chr = as.numeric(non_inter_fuzzy$chr)     
812
  non_inter_fuzzy$lower_bound = as.numeric(non_inter_fuzzy$lower_bound)
813
  non_inter_fuzzy$upper_bound = as.numeric(non_inter_fuzzy$upper_bound)
814
  non_inter_fuzzy = non_inter_fuzzy[order(non_inter_fuzzy$lower_bound),]      
815
  return(non_inter_fuzzy)
816
}
817

    
818
union_regions = function(# Aggregate regions that intersect themnselves.
819
### This function is based on sort of lower bounds to detect regions that intersect. We compare lower bound and upper bound of the porevious item. This function embed a while loop and break break regions list become stable.  
820
regions ##<< The Regions to be aggregated
821
) {
822
  old_length = length(regions[,1])
823
  new_length = 0
824
    
825
  while (old_length != new_length) {
826
    regions = regions[order(regions$lower_bound), ]
827
    regions$stop = !c(regions$lower_bound[-1] - regions$upper_bound[-length(regions$lower_bound)] <= 0, TRUE)      
828
    vec_end_1 = which(regions$stop)
829
    if (length(vec_end_1) == 0) {
830
      vec_end_1 = c(length(regions$stop))      
831
    } 
832
    if (vec_end_1[length(vec_end_1)] != length(regions$stop)) {
833
      vec_end_1 = c(vec_end_1, length(regions$stop))
834
    }
835
    vec_beg_1 = c(1, vec_end_1[-length(vec_end_1)] + 1)
836
    union = apply(t(1:length(vec_beg_1)), 2, function(i) {
837
      chr = regions$chr[vec_beg_1[i]]
838
      lower_bound = min(regions$lower_bound[vec_beg_1[i]:vec_end_1[i]])
839
      upper_bound = max(regions$upper_bound[vec_beg_1[i]:vec_end_1[i]])
840
      roi_index = regions$roi_index[vec_beg_1[i]] 
841
      data.frame(list(chr=chr, lower_bound=lower_bound, upper_bound=upper_bound, roi_index=roi_index))
842
      })
843
    union = collapse_regions(union)
844
    old_length = length(regions[,1])
845
    new_length = length(union[,1])
846
    regions = union
847
  }
848
  return(union)
849
}
850

    
851
remove_aligned_wp = function(# Remove wp nucs from common nucs list.
852
### It is based on common wp nucs index on nucs and region.  
853
strain_maps, ##<< Nuc maps.
854
roi_index, ##<< The region of interest index.
855
tmp_common_nucs, ##<< the list of wp nucs.
856
strain##<< The strain to consider.
857
){
858
  fuzzy_nucs = strain_maps[[strain]]
859
  fuzzy_nucs = fuzzy_nucs[fuzzy_nucs$roi_index == roi_index,]
860
  fuzzy_nucs = fuzzy_nucs[order(fuzzy_nucs$index_nuc),]
861
  if (length(fuzzy_nucs[,1]) == 0) {return(fuzzy_nucs)}
862
  if (sum(fuzzy_nucs$index_nuc == min(fuzzy_nucs$index_nuc):max(fuzzy_nucs$index_nuc)) != max(fuzzy_nucs$index_nuc)) {"Warning in index!"}
863
  anti_index_1 = tmp_common_nucs[[paste("index_nuc", strain, sep="_")]]
864
  fuzzy_nucs = fuzzy_nucs[-anti_index_1,]
865
  return(fuzzy_nucs)
866
}
867

    
868
translate_regions = function(# Translate a list of regions from a strain ref to another.
869
### This function is an eloborated call to translate_roi.
870
regions, ##<< Regions to be translated. 
871
combi, ##<< Combination of strains.
872
roi_index, ##<< The region of interest index.
873
config=NULL, ##<< GLOBAL config variable
874
roi ##<< The region of interest.
875
) {
876
  tr_regions = apply(t(1:length(regions[,1])), 2, function(i) {
877
    tmp_regions_ref2 = list(name="foo", begin=regions[i,]$lower_bound, end=regions[i,]$upper_bound, chr=as.character(regions[i,]$chr), strain_ref = combi[2])
878
    big_roi =  roi
879
    trs_tmp_regions_ref2 = translate_roi(tmp_regions_ref2, combi[1], config = config, big_roi = big_roi) 
880
    data.frame(list(chr=trs_tmp_regions_ref2$chr, lower_bound=min(trs_tmp_regions_ref2$begin, trs_tmp_regions_ref2$end), upper_bound=max(trs_tmp_regions_ref2$begin, trs_tmp_regions_ref2$end), roi_index=roi_index))
881
    })
882
  return(collapse_regions(tr_regions))
883
}
884

    
885
collapse_regions = function(# reformat an "apply  manipulated" list of regions
886
### Utils to reformat an "apply  manipulated" list of regions
887
regions ##< a list of regions
888
) {
889
  regions = do.call(rbind, regions)      
890
  regions$chr = as.character(regions$chr)     
891
  regions$chr = as.numeric(regions$chr)     
892
  regions$lower_bound = as.numeric(regions$lower_bound)
893
  regions$upper_bound = as.numeric(regions$upper_bound)
894
  regions = regions[order(regions$lower_bound),]      
895
  return(regions)  
896
}
897

    
898
extract_wp = function(# Extract wp nucs from nuc map. 
899
### Function based on common wp nuc index and roi_index.
900
strain_maps, ##<< Nuc maps.
901
roi_index, ##<< The region of interest index.
902
strain, ##<< The strain to consider.
903
tmp_common_nucs ##<< the list of wp nucs.
904
) {
905
  wp_nucs = apply(t(tmp_common_nucs[[paste("index_nuc", strain, sep="_")]]), 2, function(i) {
906
    tmp_wp_nucs = strain_maps[[strain]]
907
    tmp_wp_nucs = tmp_wp_nucs[tmp_wp_nucs$roi_index == roi_index & tmp_wp_nucs$index_nuc == i,]
908
    return(tmp_wp_nucs)
909
    })
910
  return(collapse_regions(wp_nucs))
911
}
912

    
913

    
914

    
915

    
916

    
917
crop_fuzzy = function(# Crop bound of regions according to region of interest bound
918
### The fucntion is no more necessary since we remove "big_roi" bug in translate_roi function.
919
tmp_fuzzy_nucs, ##<< the regiuons to be croped.
920
roi, ##<< The region of interest.
921
strain, ##<< The strain to consider.
922
config=NULL ##<< GLOBAL config variable
923
) {
924
  tr_roi = translate_roi(roi, strain, config = config)
925
  tr_roi_begin = min(tr_roi$begin, tr_roi$end)
926
  tr_roi_end = max(tr_roi$begin, tr_roi$end)
927
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound < tr_roi_begin,1]) > 0) {
928
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound < tr_roi_begin,]$lower_bound = tr_roi_begin
929
  }
930
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound < tr_roi_begin,1]) > 0) {
931
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound < tr_roi_begin,]$upper_bound = tr_roi_begin
932
  }
933
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound > tr_roi_end,1]) > 0) {
934
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound > tr_roi_end,]$lower_bound = tr_roi_end
935
  }
936
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound > tr_roi_end,1]) > 0) {
937
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound > tr_roi_end,]$upper_bound = tr_roi_end
938
  }
939
  tmp_fuzzy_nucs = tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound != tmp_fuzzy_nucs$lower_bound,]
940
  return(tmp_fuzzy_nucs)
941
}
942

    
943

    
944
get_fuzzy = function(# Compute the fuzzy nucs.
945
### This function aggregate non common wp nucs for each strain and substract common wp nucs. It does not take care about the size of the resulting fuzzy regions. It will be take into account in the count read part og the pipeline.
946
combi, ##<< The strain combination to consider.
947
roi, ##<< The region of interest.
948
roi_index, ##<< The region of interest index.
949
strain_maps, ##<< Nuc maps.
950
common_nuc_results, ##<< Common wp nuc maps
951
config=NULL ##<< GLOBAL config variable
952
) {
953
  print(roi_index)
954
  PLOT = FALSE
955
  tmp_common_nucs = common_nuc_results[[paste(combi[1], combi[2], sep="_")]]
956
  tmp_common_nucs = tmp_common_nucs[tmp_common_nucs$roi_index == roi_index, ]
957

    
958
  print(paste("Dealing with fuzzy from", combi[1]))
959
  tmp_fuzzy_nucs_1 = remove_aligned_wp(strain_maps, roi_index, tmp_common_nucs, combi[1])
960
  tmp_fuzzy_nucs_1 = crop_fuzzy(tmp_fuzzy_nucs_1, roi, combi[1], config)
961
  if (length(tmp_fuzzy_nucs_1[,1]) == 0) {return(NULL)}
962
  agg_fuzzy_1 = union_regions(tmp_fuzzy_nucs_1)    
963
  if (PLOT) for (i in 1:length(agg_fuzzy_1[,1])) {
964
    lines(c(agg_fuzzy_1[i,]$lower_bound, agg_fuzzy_1[i,]$upper_bound), c(+3.1,+3.1), col=2)
965
  }
966

    
967
  print(paste("Dealing with fuzzy from ", combi[2]))
968
  tmp_fuzzy_nucs_2 = remove_aligned_wp(strain_maps, roi_index, tmp_common_nucs, combi[2])
969
  if (length(tmp_fuzzy_nucs_2[,1]) == 0) {return(NULL)}
970
  agg_fuzzy_2 = union_regions(tmp_fuzzy_nucs_2)      
971
  agg_fuzzy_2 = crop_fuzzy(agg_fuzzy_2, roi, combi[2], config)
972
  tr_agg_fuzzy_2 = translate_regions(agg_fuzzy_2, combi, roi_index, roi=roi, config=config)
973
  tr_agg_fuzzy_2 = crop_fuzzy(tr_agg_fuzzy_2, roi, combi[2], config)
974
  # tr_agg_fuzzy_2 = union_regions(tr_agg_fuzzy_2)
975
  if (PLOT) for (i in 1:length(tr_agg_fuzzy_2[,1])) {
976
    lines(c(tr_agg_fuzzy_2[i,]$lower_bound, tr_agg_fuzzy_2[i,]$upper_bound), c(+3.3,+3.3), col=2)
977
  }
978
  
979
  print("Dealing with fuzzy from both...")
980
  all_fuzzy = union_regions(rbind(agg_fuzzy_1, tr_agg_fuzzy_2))
981
  if (PLOT) for (i in 1:length(all_fuzzy[,1])) {
982
    lines(c(all_fuzzy[i,]$lower_bound, all_fuzzy[i,]$upper_bound), c(+3.2, +3.2), col=1)
983
  }
984
  
985
  print(paste("Dealing with wp from", combi[1]))
986
  wp_nucs_1 = extract_wp(strain_maps, roi_index, combi[1], tmp_common_nucs)
987
  if (PLOT) for (i in 1:length(wp_nucs_1[,1])) {
988
    lines(c(wp_nucs_1[i,]$lower_bound, wp_nucs_1[i,]$upper_bound), c(+3.5,+3.5), col=3)
989
  }
990

    
991
  print(paste("Dealing with wp from", combi[2]))
992
  wp_nucs_2 = extract_wp(strain_maps, roi_index, combi[2], tmp_common_nucs)
993
  tr_wp_nucs_2 = translate_regions(wp_nucs_2, combi, roi_index, roi=roi, config=config)
994
  if (PLOT) for (i in 1:length(tr_wp_nucs_2[,1])) {
995
    lines(c(tr_wp_nucs_2[i,]$lower_bound, tr_wp_nucs_2[i,]$upper_bound), c(+3.7,+3.7), col=3)
996
  }
997

    
998
  print("Dealing with wp from both...")
999
  all_wp = union_regions(rbind(wp_nucs_1[,1:4], tr_wp_nucs_2))
1000
  if (PLOT) for (i in 1:length(all_wp[,1])) {
1001
    lines(c(all_wp[i,]$lower_bound, all_wp[i,]$upper_bound), c(+3.6, +3.6), col=1)
1002
  }  
1003

    
1004
  print("Dealing with fuzzy and wp...")
1005
  non_inter_fuzzy = substract_region(all_fuzzy, all_wp)
1006
  if (is.null(non_inter_fuzzy)) { return(NULL) }
1007
  
1008
  non_inter_fuzzy$len = non_inter_fuzzy$upper_bound - non_inter_fuzzy$lower_bound
1009
  # non_inter_fuzzy = non_inter_fuzzy[non_inter_fuzzy$len >= min_fuzz_width,]
1010
  if (PLOT) for (i in 1:length(non_inter_fuzzy[,1])) {
1011
    lines(c(non_inter_fuzzy[i,]$lower_bound, non_inter_fuzzy[i,]$upper_bound), c(+3.9, +3.9), col=1)
1012
  }
1013
  
1014
  non_inter_fuzzy$index_nuc = 1:length(non_inter_fuzzy[,1])
1015
  return (non_inter_fuzzy)
1016
}
1017

    
1018

    
1019

    
1020
get_all_reads = function(# Retrieve Reads 
1021
### Retrieve reads for a given marker, combi, form.
1022
marker, ##<< The marker to considere.
1023
combi, ##<< The starin combination to considere.
1024
form="wp", ##<< The nuc form to considere.
1025
config=NULL ##<< GLOBAL config variable
1026
) {
1027
	all_reads = NULL
1028
  for (manip in c("Mnase_Seq", marker)) {
1029
    if (form == "fuzzy") {
1030
		  out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_fuzzy_and_nbreads.tab",sep="")
1031
  		tmp_res = read.table(file=out_filename, header=TRUE)				
1032
			tmp_res = tmp_res[tmp_res[,3] - tmp_res[,2] > 75,]
1033
      tmp_res$form = form
1034
    } else if (form == "wp") {
1035
		 	out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_wp_and_nbreads.tab",sep="")
1036
  		tmp_res = read.table(file=out_filename, header=TRUE)				
1037
      tmp_res$form = form
1038
    } else if (form == "wpfuzzy") {
1039
		 	out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_wp_and_nbreads.tab",sep="")
1040
  		tmp_res = read.table(file=out_filename, header=TRUE)				
1041
      tmp_res$form = "wp"
1042
		  out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_fuzzy_and_nbreads.tab",sep="")
1043
  		tmp_res2 = read.table(file=out_filename, header=TRUE)				
1044
			tmp_res2 = tmp_res2[tmp_res2[,3] - tmp_res2[,2] > 75,]
1045
      tmp_res2$form = "fuzzy"
1046
      tmp_res = rbind(tmp_res, tmp_res2)
1047
    }
1048
		if (is.null(all_reads)) {
1049
			all_reads = tmp_res[,c(1:9,length(tmp_res))]
1050
		}     
1051
		tmp_res = tmp_res[,-c(1:9,length(tmp_res))]
1052
		all_reads = cbind(all_reads, tmp_res)
1053
  }
1054
  return(all_reads)	
1055
}
1056

    
1057
get_design = function(# Build the design for deseq
1058
### This function build the design according sample properties.
1059
marker, ##<< The marker to considere.
1060
combi, ##<< The starin combination to considere.
1061
all_samples ##<< Global list of samples.
1062
) {
1063
  off1 = 0
1064
  off2 = 0
1065
	manips = c("Mnase_Seq", marker)
1066
	design_rownames = c()
1067
	design_manip = c()
1068
	design_strain = c()
1069
  off2index = function(off) {	
1070
  	switch(toString(off), 
1071
  		"1"=c(0,1,1),
1072
  	  "2"=c(1,0,1),
1073
    	"3"=c(1,1,0), 
1074
  		c(1,1,1)
1075
  		)	
1076
  }
1077
	for (manip in manips) {
1078
		tmp_samples = all_samples[ ((all_samples$strain == combi[1] | all_samples$strain == combi[2]) &  all_samples$marker == manip), ]
1079
		tmp_samples = tmp_samples[order(tmp_samples$strain), ]
1080
		if (manip == "H3K4me1" & (off1 != 0 & off2 ==0 )) {
1081
			tmp_samples = tmp_samples[c(off2index(off1), c(1,1)) == 1,]
1082
		} else {
1083
			if (manip != "Mnase_Seq" & (off1 != 0 | off2 !=0)) {
1084
				tmp_samples = tmp_samples[c(off2index(off1), off2index(off2)) == 1,]
1085
			}
1086
		}
1087
		design_manip = c(design_manip, rep(manip, length(tmp_samples$id)))				
1088
		for (strain in combi) {			
1089
			cols = apply(t(tmp_samples[ (tmp_samples$strain == strain &  tmp_samples$marker == manip), ]$id), 2, function(i){paste(strain, manip, i, sep="_")})
1090
			design_strain = c(design_strain, rep(strain, length(cols)))
1091
			design_rownames = c(design_rownames, cols)
1092
		}
1093
	}
1094
	snep_design = data.frame( row.names=design_rownames, manip=design_manip, strain=design_strain)
1095
	return(snep_design)	
1096
}
1097

    
1098
plot_dist_samples = function(# Plot the distribution of reads.
1099
### This fuxntion use the deseq nomalization feature to compare qualitatively the distribution.
1100
strain, ##<< The strain to considere.
1101
marker, ##<< The marker to considere.
1102
res, ##<< Data
1103
all_samples, ##<< Global list of samples.
1104
NEWPLOT = TRUE ##<< If FALSE the curve will be add to the current plot.
1105
) {			
1106
	cols = apply(t(all_samples[ (all_samples$strain == strain &  all_samples$marker == marker), ]$id), 2, function(i){paste(strain, marker, i, sep="_")})
1107
	snepCountTable = res[,cols]
1108
	snepDesign = data.frame(
1109
		row.names = cols,
1110
		manip = rep(marker, length(cols)),
1111
		strain = rep(strain, length(cols))
1112
		)
1113
	cdsFull = newCountDataSet(snepCountTable, snepDesign)
1114
	sizeFactors = estimateSizeFactors(cdsFull)
1115
	# print(sizeFactors[[1]])
1116
	sample_ids = all_samples[ (all_samples$strain == strain &  all_samples$marker == marker), ]$id
1117
	if (NEWPLOT) {
1118
		plot(density(res[,paste(strain, marker, sample_ids[1], sep="_")] / sizeFactors[[1]][1]), col=0, main=paste(strain, marker))				
1119
		NEWPLOT = FALSE
1120
	}
1121
	for (it in 1:length(sample_ids)) {
1122
		sample_id = sample_ids[it]
1123
		lines(density(res[,paste(strain, marker, sample_id, sep="_")] / sizeFactors[[1]][it]), col = it + 1, lty = it)
1124
	}
1125
  legend("topright", col=(1:length(sample_ids))+1, lty=1:length(sample_ids), legend=cols)
1126
}
1127

    
1128
analyse_design = function(# Launch deseq methods.
1129
### This function is based on deseq example. It mormalizes data, fit data to GLM model with and without interaction term and compare the two l;=models.
1130
snep_design, ##<< The design to considere.
1131
reads ##<< The data to considere. 
1132
) {
1133
	snep_count_table = reads[, rownames(snep_design)]
1134
	cdsFull = newCountDataSet(snep_count_table, snep_design)
1135
	cdsFull1 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
1136
	fit1 = fitNbinomGLMs(cdsFull1, count ~ manip * strain)
1137

    
1138
	cdsFull0 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
1139
	fit0 = fitNbinomGLMs(cdsFull0, count ~ manip + strain)
1140

    
1141
	pvalsGLM = nbinomGLMTest( fit1, fit0 )
1142
	return(list(fit1, fit0, snep_design, pvalsGLM))	
1143
}
1144

    
1145

    
1146

    
1147

    
1148

    
1149

    
1150

    
1151

    
1152
get_sneps = structure(function(# Compute the list of SNEPs for a given set of marker, strain combination and nuc form.
1153
### This function uses 
1154
marker, ##<< The marker involved.
1155
combi, ##<< The strain combination involved.
1156
form, ##<< the nuc form involved. 
1157
all_samples, ##<< Global list of samples.
1158
config=NULL ##<< GLOBAL config variable
1159
) {
1160
  # PRETREAT
1161
  d = get_design(marker, combi, all_samples)
1162
  reads = get_all_reads(marker, combi, form, config=config)
1163
  # RUN ANALYSE
1164
  tmp_analyse = analyse_design(d, reads)
1165
  # RESULTS
1166
	fit1 = tmp_analyse[[1]]
1167
	fit0 = tmp_analyse[[2]]
1168
  k = names(fit1)
1169
  reads[[k[2]]] = signif(fit1[[k[2]]], 5)
1170
  reads[[k[3]]] = signif(fit1[[k[3]]], 5)
1171
  reads[[k[4]]] = signif(fit1[[k[4]]], 5)
1172
	reads$pvalsGLM = signif(tmp_analyse[[4]], 5)
1173
	snep_design = tmp_analyse[[3]]
1174
  print(snep_design)
1175
	fdr = 0.0001
1176
	thres = FDR(reads$pvalsGLM, fdr) 
1177
	reads$snep_index = reads$pvalsGLM < thres
1178
	print(paste(sum(reads$snep_index), " SNEPs found for ", length(reads[,1])," nucs and ", fdr*100,"% of FDR.", sep = ""))
1179
  return(reads)
1180
  },  ex=function(){	
1181
    marker = "H3K4me1"
1182
    combi = c("BY", "YJM") 
1183
    form = "wpfuzzy" # "wp" | "fuzzy" | "wpfuzzy"
1184
    # foo = get_sneps(marker, combi, form)
1185
    # foo = get_sneps("H4K12ac", c("BY", "RM"), "wp")
1186
})
1187

    
1188

    
1189

    
1190

    
1191

    
1192

    
1193

    
1194

    
1195

    
1196

    
1197

    
1198

    
1199

    
1200

    
1201

    
1202

    
1203

    
1204

    
1205

    
1206

    
1207

    
1208

    
1209

    
1210

    
1211
ROM2ARAB = function(# Roman to Arabic pair list.
1212
### Util to convert Roman to Arabic
1213
){list(
1214
  "I" = 1,
1215
  "II" = 2,
1216
  "III" = 3,
1217
  "IV" = 4,
1218
  "V" = 5,
1219
  "VI" = 6,
1220
  "VII" = 7,
1221
  "VIII" = 8,
1222
  "IX" = 9,
1223
  "X" = 10,
1224
  "XI" = 11,
1225
  "XII" = 12,
1226
  "XIII" = 13,
1227
  "XIV" = 14,
1228
  "XV" = 15,
1229
  "XVI" = 16,
1230
  "XVII" = 17,
1231
  "XVIII" = 18,
1232
  "XIX" = 19,
1233
  "XX" = 20
1234
)}
1235

    
1236
switch_pairlist = structure(function(# Switch a pairlist
1237
### Take a pairlist key:value and return the switched pairlist value:key.
1238
l ##<< The pairlist to switch. 
1239
) {
1240
	ret = list()
1241
	for (name in names(l)) {
1242
		ret[[as.character(l[[name]])]] = name
1243
	}
1244
	ret
1245
### The switched pairlist.
1246
}, ex=function(){
1247
	l = list(key1 = "value1", key2 = "value2")
1248
	print(switch_pairlist(l))
1249
})
1250

    
1251
ARAB2ROM = function(# Arabic to Roman pair list.
1252
### Util to convert Arabicto Roman
1253
){switch_pairlist(ROM2ARAB())}
1254

    
1255

    
1256

    
1257

    
1258
translate_roi = structure(function(# Translate coords of a genome region.
1259
### This function is used in the examples, usualy you have to define your own translation function and overwrite this one using \emph{unlockBinding} features. Please, refer to the example.
1260
roi, ##<< Original genome region of interest.
1261
strain2, ##<< The strain in wich you want the genome region of interest.
1262
config=NULL, ##<< GLOBAL config variable
1263
big_roi=NULL ##<< A largest region than roi use to filter c2c if it is needed.
1264
) {
1265
	strain1 = roi$strain_ref
1266
  reverse = (strain1=="RM" & strain2=="BY") | strain1=="YJM"
1267
	if (strain1 == strain2) {
1268
		roi$length = roi$end - roi$begin + sign(roi$end - roi$begin) * 1	
1269
		return(roi)
1270
	}
1271
	# Launch c2c file
1272
	if (reverse) {
1273
		c2c_file = list(filename=config$C2C_FILES[[paste(strain2, "-", strain1, sep="")]])
1274
	} else {
1275
		c2c_file = list(filename=config$C2C_FILES[[paste(strain1, "-", strain2, sep="")]])
1276
	}
1277
	c2c = get_content(c2c_file$filename, "table", stringsAsFactors=FALSE)	
1278
	# filtering it
1279
  c2c = c2c[c2c$V6=="-",]
1280
	# Reverse
1281
	if (reverse) {
1282
		tmp_col = c2c$V1
1283
		c2c$V1 = c2c$V7
1284
		c2c$V7 = tmp_col
1285
		tmp_col = c2c$V2
1286
		c2c$V2 = c2c$V9
1287
		c2c$V9 = tmp_col
1288
		tmp_col = c2c$V3
1289
		c2c$V3 = c2c$V10
1290
		c2c$V10 = tmp_col
1291
	}
1292
	# Restrict c2c to big_roi
1293
	# if (FALSE) {
1294
	if (!is.null(big_roi)) {
1295
		if (roi$strain_ref != big_roi$strain_ref) {
1296
      # print("WARNING, big_roi and roi not in the same strain_ref... translating big_roi.")
1297
      big_roi = translate_roi(big_roi, roi$strain_ref, config=config)
1298
    }
1299
    if (big_roi$end < big_roi$begin) {
1300
      tmp_var = big_roi$begin
1301
      big_roi$begin = big_roi$end 
1302
      big_roi$end = tmp_var        
1303
      big_roi$length = big_roi$length        
1304
    }    
1305
    if (big_roi$chr!=roi$chr | roi$end > big_roi$end | roi$end < big_roi$begin | roi$begin > big_roi$end | roi$begin < big_roi$begin) {
1306
      print("WARNING! Trying to translate a roi not included in a big_roi.")
1307
      return(NULL)
1308
    }
1309
		if (strain1 == "BY") {
1310
			big_chro_1 = paste("chr", ARAB2ROM()[[big_roi$chr]], sep="")
1311
		} else if (strain1 == "RM") {			
1312
		  big_chro_1 = paste("supercontig_1.",big_roi$chr,sep="")
1313
		} else if (strain1 == "YJM") {			
1314
		  big_chro_1 = switch_pairlist(config$FASTA_INDEXES$YJM)[[big_roi$chr]]
1315
		}
1316
		big_begin_1 = big_roi$begin
1317
	  big_end_1 = big_roi$end
1318
		c2c = c2c[c2c$V1==big_chro_1,]
1319
    if (length(c2c[c2c$V3 < big_begin_1 & c2c$V2 < c2c$V3, 1] > 0)) {c2c[c2c$V3 < big_begin_1 & c2c$V2 < c2c$V3,c("V2", "V3") ] = big_begin_1}
1320
    if (length(c2c[c2c$V2 > big_end_1   & c2c$V2 < c2c$V3, 1] > 0)) {c2c[c2c$V2 > big_end_1   & c2c$V2 < c2c$V3, c("V2", "V3")] = big_end_1}
1321
    if (length(c2c[c2c$V2 < big_begin_1 & c2c$V3 < c2c$V2, 1] > 0)) {c2c[c2c$V2 < big_begin_1 & c2c$V3 < c2c$V2,c("V2", "V3") ] = big_begin_1}
1322
    if (length(c2c[c2c$V3 > big_end_1   & c2c$V3 < c2c$V2, 1] > 0)) {c2c[c2c$V3 > big_end_1   & c2c$V3 < c2c$V2, c("V2", "V3")] = big_end_1}
1323
    c2c = c2c[c2c$V2 - c2c$V3 != 0,]
1324
	}
1325
  #	Convert initial roi$chr into c2c format
1326
	if (strain1 == "BY") {
1327
		chro_1 = paste("chr", ARAB2ROM()[[roi$chr]], sep="")
1328
	} else if (strain1 == "RM") {			
1329
	  chro_1 = paste("supercontig_1.",roi$chr,sep="")
1330
	} else if (strain1 == "YJM") {			
1331
	  chro_1 = switch_pairlist(config$FASTA_INDEXES$YJM)[[roi$chr]]
1332
	}
1333
	begin_1 = roi$begin
1334
  end_1 = roi$end
1335
  # Computing equivalent strain_2 alignment coordinates	
1336
	if (reverse) {
1337
  	tmptransfostart = c2c[c2c$V1==chro_1 & ((c2c$V3>=begin_1 & c2c$V2<=begin_1 & c2c$V8==1) | (c2c$V2>=begin_1 & c2c$V3<=begin_1 & c2c$V8==-1)),]
1338
    tmptransfostop = c2c[c2c$V1==chro_1 &  ((c2c$V3>=end_1   & c2c$V2<=end_1   & c2c$V8==1) | (c2c$V2>=end_1   & c2c$V3<=end_1   & c2c$V8==-1)),]
1339
	} else {
1340
		tmptransfostart = c2c[c2c$V1==chro_1 & c2c$V3>=begin_1 & c2c$V2<=begin_1,]
1341
	  tmptransfostop = c2c[c2c$V1==chro_1 & c2c$V3>=end_1 & c2c$V2<=end_1,]
1342
	}
1343
	# Never happend conditions ...	
1344
	{
1345
		if (length(tmptransfostart$V8) == 0) {
1346
			# begin_1 is between to lines: shift begin_1 to the start of 2nd line.
1347
      if (sum(c2c$V3 >= c2c$V2) != 0) {
1348
  			tmp_c2c = c2c[c2c$V1==chro_1 & c2c$V2>=begin_1,]
1349
  			begin_1 = min(tmp_c2c$V2)
1350
      } else {
1351
  			tmp_c2c = c2c[c2c$V1==chro_1 & c2c$V3>=begin_1,]
1352
  			begin_1 = min(tmp_c2c$V3)
1353
      }
1354
			if (reverse) {
1355
		  	tmptransfostart = c2c[c2c$V1==chro_1 & ((c2c$V3>=begin_1 & c2c$V2<=begin_1 & c2c$V8==1) | (c2c$V2>=begin_1 & c2c$V3<=begin_1 & c2c$V8==-1)),]
1356
			} else {
1357
				tmptransfostart = c2c[c2c$V1==chro_1 & c2c$V3>=begin_1 & c2c$V2<=begin_1,]
1358
			}
1359
			if (length(tmptransfostart$V8) == 0) {
1360
				if (!is.null(big_roi)) {
1361
					return(NULL)
1362
					tmptransfostart = c2c[c2c$V1==chro_1 & c2c$V3>=big_roi$begin & c2c$V2<=big_roi$begin,]
1363
				} else { 
1364
					# return(NULL)
1365
					# print(c2c[c2c$V1==chro_1 & c2c$V2<=end_1 & c2c$V3>=begin_1,])
1366
					# print(c2c[c2c$V1==chro_1,])
1367
					print(tmptransfostart)
1368
					print(tmptransfostop)
1369
					stop("Never happend condition 1.")					
1370
				}
1371
			}
1372
		} 
1373
		if (length(tmptransfostop$V8) == 0) {
1374
			# end_1 is between to lines: shift end_1 to the end of 2nd line.
1375
      if (sum(c2c$V3 >= c2c$V2) != 0) {
1376
  			tmp_c2c = c2c[c2c$V1==chro_1 & c2c$V3<=end_1,]
1377
  			end_1 = max(tmp_c2c$V3)
1378
      } else {
1379
  			tmp_c2c = c2c[c2c$V1==chro_1 & c2c$V2<=end_1,]
1380
  			end_1 = max(tmp_c2c$V2)
1381
      }
1382
			if (reverse) {
1383
		    tmptransfostop = c2c[c2c$V1==chro_1 &  ((c2c$V3>=end_1   & c2c$V2<=end_1   & c2c$V8==1) | (c2c$V2>=end_1   & c2c$V3<=end_1   & c2c$V8==-1)),]
1384
			} else {
1385
			  tmptransfostop = c2c[c2c$V1==chro_1 & c2c$V3>=end_1 & c2c$V2<=end_1,]
1386
			}
1387
			if (length(tmptransfostop$V8) == 0) {
1388
				if (!is.null(big_roi)) {
1389
					return(NULL)
1390
				  tmptransfostop = c2c[c2c$V1==chro_1 & c2c$V3>=big_roi$end & c2c$V2<=big_roi$end,]
1391
				} else {
1392
					# return(NULL)
1393
					print(c2c[c2c$V1==chro_1,])
1394
					print(tmptransfostart)
1395
					print(tmptransfostop)
1396
					stop("Never happend condition 2.")
1397
				} 
1398
			} 
1399
		}
1400
		if (length(tmptransfostart$V8) != 1) {
1401
			# tmptransfostart = tmptransfostart[1,]
1402
			# print("many start")
1403
			# print(c2c[c2c$V1==chro_1,])
1404
			tmptransfostart = tmptransfostart[tmptransfostart$V1==chro_1 & tmptransfostart$V3>=begin_1 & tmptransfostart$V2==begin_1,]
1405
			if (length(tmptransfostart$V8) != 1) {
1406
				# return(NULL)
1407
				print(tmptransfostart)
1408
				print(tmptransfostop)
1409
  			stop("Never happend condition 3.")
1410
			}
1411
		}
1412
		if (length(tmptransfostop$V8) != 1) {
1413
			# tmptransfostop = tmptransfostop[length(tmptransfostop$V8),]
1414
			# print("many stop")
1415
			# print(tmptransfostop)
1416
			# print(roi)
1417
		  tmptransfostop = tmptransfostop[tmptransfostop$V1==chro_1 & tmptransfostop$V3==end_1 & tmptransfostop$V2<=end_1,]
1418
			if (length(tmptransfostop$V8) != 1) {
1419
				# return(NULL)
1420
				print(tmptransfostart)
1421
				print(tmptransfostop)
1422
  			stop("Never happend condition 4.")
1423
			}
1424
		}		
1425
		if (tmptransfostart$V7 != tmptransfostop$V7) {
1426
			print(tmptransfostart)
1427
			print(tmptransfostop)		
1428
 			stop("Problem with genome region of interest of strain 1. \nIt is translated over many contigs into strain 2 ref. \nSorry, but you have to redefine your region of interest.")
1429
		}
1430
	} 
1431
  # Deal with strand
1432
  if (tmptransfostart$V8 == 1) {
1433
    begin_2 = tmptransfostart$V9 + (begin_1 - tmptransfostart$V2)
1434
    end_2 = tmptransfostop$V9 + (end_1 - tmptransfostop$V2)
1435
  } else {
1436
    begin_2 = tmptransfostart$V9 - (begin_1 - tmptransfostart$V2)
1437
    end_2 = tmptransfostop$V9 - (end_1 - tmptransfostop$V2)
1438
  }
1439
	# Build returned roi
1440
	roi$strain_ref = strain2
1441
	if (roi$strain_ref == "BY") {
1442
		roi$chr = ROM2ARAB()[[substr(tmptransfostart$V7, 4, 12)]]
1443
	} else {
1444
		roi$chr = config$FASTA_INDEXES[[strain2]][[tmptransfostart$V7]]
1445
	}
1446
  roi$begin = begin_2
1447
  roi$end = end_2
1448
	if (sign(roi$end - roi$begin) == 0) {
1449
		roi$length = 1			
1450
	} else {		
1451
		roi$length = roi$end - roi$begin + sign(roi$end - roi$begin) * 1	
1452
	}
1453
  return(roi)  
1454
}, ex=function(){
1455
	# Define new translate_roi function...
1456
	translate_roi = function(roi, strain2, config) {
1457
		strain1 = roi$strain_ref
1458
		if (strain1 == strain2) {
1459
			return(roi)
1460
		} else {
1461
		  stop("Here is my new translate_roi function...")		
1462
		}	
1463
	}
1464
	# Binding it by uncomment follwing lines.
1465
	# unlockBinding("translate_roi", as.environment("package:nm"))
1466
	# unlockBinding("translate_roi", getNamespace("nm"))
1467
	# assign("translate_roi", translate_roi, "package:nm")
1468
	# assign("translate_roi", translate_roi, getNamespace("nm"))
1469
	# lockBinding("translate_roi", getNamespace("nm"))
1470
	# lockBinding("translate_roi", as.environment("package:nm"))	
1471
})
1472

    
1473

    
1474

    
1475

    
1476

    
1477

    
1478

    
1479
compute_inter_all_strain_curs = function (# Compute Common Uninterrupted Regions (CUR)
1480
### CURs are regions that can be aligned between the genomes 
1481
diff_allowed = 10, ##<< the maximum indel width allowe din a CUR
1482
min_cur_width = 200, ##<< The minimum width of a CUR
1483
config = NULL, ##<< GLOBAL config variable
1484
plot = FALSE ##<< Plot CURs or not
1485
) {
1486
  get_inter_strain_rois = function(strain1, strain2, diff_allowed = 10, min_cur_width = 200, plot=FALSE) {
1487
  	c2c = get_content(config$C2C_FILES[[paste(strain1, "-", strain2, sep="")]], "table", stringsAsFactors=FALSE)
1488
    # Filtering unagapped
1489
    c2c = c2c[c2c$V6=="-",]
1490
    # filtering some things (chr...)
1491
    # c2c = c2c[c2c$V1 == "chrIV",]
1492
    diff = c2c$V2[-1] - c2c$V3[-length(c2c$V2)] 
1493
    diff2 = c2c$V9[-1] - c2c$V10[-length(c2c$V2)] 
1494
  	# Plot diffs to define a threshold (diff_allowed)
1495
  	# hist(abs(c(diff2, diff)),breaks=c(0:2000, 200000000000), xlim=c(0,100))
1496
    # Filtering
1497
  	indexes_stop = which(abs(diff) > diff_allowed | abs(diff2) > diff_allowed)
1498
  	indexes_start = c(1, indexes_stop[-length(indexes_stop)] + rep(1, length(indexes_stop) -1))
1499

    
1500
    rois = NULL	
1501
  	for(i in 1:length(indexes_start)) {
1502
  		start = indexes_start[i]
1503
  		stop = indexes_stop[i]
1504
  		sub_c2c = c2c[start:stop,]
1505
  		if (strain1 == "BY") {
1506
  			chr = ROM2ARAB()[[substr(sub_c2c[1,]$V1,4,10)]]
1507
  		} else {			
1508
  			chr = config$FASTA_INDEXES[[strain1]][[sub_c2c[1,]$V1]]
1509
  		}
1510
  		roi = list(chr=chr, begin=sub_c2c[1,]$V2, end=sub_c2c[length(sub_c2c$V1),]$V3, strain_ref=strain1)
1511
  		roi[["length"]] = roi$end - roi$begin
1512
  		if (roi$length >= min_cur_width) {
1513
        rois = dfadd(rois,roi)
1514
  	  }
1515
  		if (length(unique(sub_c2c[,c(1,7,8)])[,2]) != 1) {
1516
  			print("*************** ERROR, non homogenous region! ********************")
1517
  		}
1518
  		# print(i)
1519
  		# print(roi)
1520
  		# print(sub_c2c)
1521
  		# print("________________________________________________________________")
1522
  	}
1523
  	if (plot) {
1524
  		print(paste(length(indexes_stop), "area of interest."))
1525
  	  # Plot rois
1526
  	  genome = get_content(config$FASTA_REFERENCE_GENOME_FILES[[strain1]], "fasta")   
1527
  		plot(0,0, ylim=(c(1,length(genome))), xlim = c(0, max(apply(t(genome), 2, function(chr){length(unlist(chr))}))))
1528
  		for (name in names(genome)) {
1529
  			if (strain1 == "BY") {
1530
  				chr_ref = paste("chr", ARAB2ROM()[[config$FASTA_INDEXES[[strain1]][[name]]]], sep="")
1531
  			} else {			
1532
  				chr_ref = name
1533
  			}
1534
  			y_lev = as.integer(config$FASTA_INDEXES[[strain1]][[name]])
1535
  			lines(c(0,length(unlist(genome[[name]]))), c(y_lev,y_lev))
1536
  			text( length(unlist(genome[[name]]))/2, y_lev, labels = chr_ref)	
1537
  		}
1538
  	  col=1
1539
  	  for (roi_index in 1:length(rois$chr)) {
1540
  			roi = rois[roi_index,]	
1541
  			y_lev = as.integer(roi$chr) + 0.3
1542
  			lines(c(roi$begin,roi$end), c(y_lev,y_lev), col=col)
1543
  			text( mean(c(roi$begin,roi$end)), y_lev, labels = roi_index)
1544
  	  	col = col + 1
1545
  	  }	
1546
  	}
1547
  	return(rois)
1548
  }
1549
	rois = NULL	
1550
	rois_BY_RM = get_inter_strain_rois("BY", "RM", min_cur_width = min_cur_width, diff_allowed = diff_allowed)
1551
	rois_BY_YJM = get_inter_strain_rois("BY", "YJM", min_cur_width = min_cur_width, diff_allowed = diff_allowed)
1552
	for (roi_1_index in 1:length(rois_BY_RM[,1])) {
1553
		roi_1 = rois_BY_RM[roi_1_index,]
1554
		roi_2_candidates = rois_BY_YJM[rois_BY_YJM$chr== roi_1$chr & rois_BY_YJM$begin <= roi_1$end & rois_BY_YJM$end >= roi_1$begin , ] ;  
1555
		# print(length(roi_2_candidates[,1]))
1556
		if (length(roi_2_candidates[,1]) > 0) {
1557
			for(roi_2_index in 1:length(roi_2_candidates[,1])) {
1558
				roi_2 = roi_2_candidates[roi_2_index,]
1559
				roi = list(chr=roi_1$chr, begin=max(roi_1$begin, roi_2$begin), end=min(roi_1$end, roi_2$end), strain_ref="BY")
1560
				roi[["length"]] = roi$end - roi$begin + 1
1561
				if (roi$length >= min_cur_width) {
1562
					# if (length(rois[,1]) == 153) {
1563
					# 	print(paste(length(rois[,1]), roi_1_index, roi_2_index ))
1564
					# 	print(roi_1)
1565
					# 	print(roi_2)
1566
					# 	print(roi)
1567
					# }
1568
			    rois = dfadd(rois,roi)
1569
			  }			
1570
			}
1571
		}
1572
	}
1573
	print(length(rois[,1]))
1574
	print(sum(rois$length))
1575
	rois_1st_round = rois
1576
	rois_2nd_round = NULL	
1577
	rois_RM_YJM = get_inter_strain_rois("RM", "YJM", min_cur_width = min_cur_width, diff_allowed = diff_allowed)
1578
	for (roi_1_index in 1:length(rois_1st_round[,1])) {
1579
		roi_1 = rois_1st_round[roi_1_index,]
1580
		translated_roi_1 = translate_roi(roi_1, "RM", config = config)
1581
		t_b = min(translated_roi_1$begin, translated_roi_1$end)
1582
		t_e = max(translated_roi_1$begin, translated_roi_1$end)
1583
		roi_2_candidates = rois_RM_YJM[rois_RM_YJM$chr== translated_roi_1$chr & rois_RM_YJM$begin <= t_e & rois_RM_YJM$end >= t_b , ] ;  
1584
		if (length(roi_2_candidates[,1]) > 0) {
1585
			for(roi_2_index in 1:length(roi_2_candidates[,1])) {
1586
				roi_2 = roi_2_candidates[roi_2_index,]
1587
				roi = list(chr=translated_roi_1$chr, begin=max(t_b, roi_2$begin), end=min(t_e, roi_2$end), strain_ref="RM")
1588
				roi[["length"]] = roi$end - roi$begin + 1
1589
				if (roi$length >= min_cur_width) {
1590
			    rois_2nd_round = dfadd(rois_2nd_round,roi)
1591
			  }			
1592
			}
1593
		}
1594
	}
1595
	print(length(rois_2nd_round[,1]))
1596
	print(sum(rois_2nd_round$length))
1597
	rois = rois_2nd_round
1598

    
1599
	rois_translator_round = list()	
1600
	for (roi_index in 1:length(rois[,1])) {
1601
		roi = rois[roi_index,]
1602
		BY_roi  = translate_roi(roi, "BY", config = config)
1603
		tmp_BY_roi = BY_roi 
1604
		BY_roi$begin = min(tmp_BY_roi$begin, tmp_BY_roi$end)
1605
		BY_roi$end = max(tmp_BY_roi$begin, tmp_BY_roi$end)
1606
		BY_roi$length = abs(BY_roi$length)
1607
		rois_translator_round = dfadd(rois_translator_round, BY_roi)
1608
	}
1609
	rois = rois_translator_round
1610

    
1611
	rois_3rd_round = NULL	
1612
	for (roi_index in 1:length(rois[,1])) {
1613
	# for (roi_index in 1:2) {
1614
		current_roi = rois[roi_index,]
1615
		# print(roi_index)
1616
	
1617
	  to_be_check_rois = dfadd(NULL, current_roi)
1618
		NEED_RERUN = TRUE
1619
		while (NEED_RERUN) {
1620
			# print("RERUN"),
1621
			NEED_RERUN = FALSE
1622
			to_be_check_again = NULL
1623
			for (to_be_check_roi_index in 1:length(to_be_check_rois[,1])) {
1624
				# print(to_be_check_rois)
1625
				to_be_check_roi = to_be_check_rois[to_be_check_roi_index,]
1626
		    combis = list(c("BY", "RM"), c("BY", "YJM"), c("RM", "YJM"), c("RM", "BY"), c("YJM", "BY"), c("YJM", "RM"))
1627
			  for (combi in combis) {
1628
					# print(combi)
1629
			    strain1 = combi[1]
1630
		      strain2 = combi[2]
1631
					trans_roi = translate_roi(to_be_check_roi, strain1, config = config)
1632
					lower_bound=min(trans_roi$begin, trans_roi$end)
1633
					upper_bound=max(trans_roi$begin, trans_roi$end)			
1634
          
1635
          check_overlaping = structure(function(strain1 = "BY", strain2 = "RM", chr = NULL, lower_bound=NULL, upper_bound=NULL) {
1636
            reverse = (strain1=="RM" & strain2=="BY") | strain1=="YJM"
1637
          	if (strain1 == strain2) {
1638
          		roi$length = roi$end - roi$begin + sign(roi$end - roi$begin) * 1	
1639
          		return(roi)
1640
          	}
1641
          	# Launch c2c file
1642
          	if (reverse) {
1643
          		c2c_file = list(filename=config$C2C_FILES[[paste(strain2, "-", strain1, sep="")]])
1644
          	} else {
1645
          		c2c_file = list(filename=config$C2C_FILES[[paste(strain1, "-", strain2, sep="")]])
1646
          	}
1647
          	c2c = get_content(c2c_file$filename, "table", stringsAsFactors=FALSE)	
1648
          	# filtering it
1649
            c2c = c2c[c2c$V6=="-",]
1650
          	# Reverse
1651
          	if (reverse) {
1652
          		tmp_col = c2c$V1
1653
          		c2c$V1 = c2c$V7
1654
          		c2c$V7 = tmp_col
1655
          		tmp_col = c2c$V2
1656
          		c2c$V2 = c2c$V9
1657
          		c2c$V9 = tmp_col
1658
          		tmp_col = c2c$V3
1659
          		c2c$V3 = c2c$V10
1660
          		c2c$V10 = tmp_col
1661
          	}
1662

    
1663
          	if (strain1 == "BY") {
1664
          		chro_1 = paste("chr", ARAB2ROM()[[chr]], sep="")
1665
          	} else if (strain1 == "RM") {			
1666
          	  chro_1 = paste("supercontig_1.",chr,sep="")
1667
          	} else if (strain1 == "YJM") {			
1668
          	  chro_1 = switch_pairlist(config$FASTA_INDEXES$YJM)[[chr]]
1669
          	}
1670
          	# print(chro_1)
1671
          	if (!is.null(lower_bound) & !is.null(upper_bound)) {
1672
              if (reverse) {
1673
          	  	tmp_c2c = c2c[c2c$V1==chro_1 & ((c2c$V3>=lower_bound & c2c$V2<=upper_bound & c2c$V8==1) | (c2c$V2>=lower_bound & c2c$V3<=upper_bound & c2c$V8==-1)),]
1674
          		} else {
1675
          			tmp_c2c = c2c[c2c$V1==chro_1 & c2c$V3>=lower_bound & c2c$V2<=upper_bound,]
1676
          		}
1677
          	} else {
1678
            	tmp_c2c = c2c[c2c$V1 == chr,]
1679
          	}
1680
          	if (length(tmp_c2c[,1]) > 1) {
1681
          		pbs = apply(t(1:(length(tmp_c2c[,1]) - 1)), 2, function(i){
1682
          			# print(paste(i, "/", length(tmp_c2c[,1])))
1683
          			apply(t((i+1):length(tmp_c2c[,1])), 2, function(j){
1684
          				l1 = tmp_c2c[i,] 
1685
          				b1 = min(l1$V2, l1$V3)
1686
          				e1 = max(l1$V2, l1$V3)
1687
          				l2 = tmp_c2c[j,]
1688
          				b2 = min(l2$V2, l2$V3)
1689
          				e2 = max(l2$V2, l2$V3)
1690
          				if ((e1>=b2 & b1<=e2) | (e2>=b1 & b2<=e1)) {
1691
          					print(paste("WARNING! Overlaping", " (", strain1, ",", strain2, ") chr: ",chr, " [", b1, ",", e1, "] [", b2, ",", e2, "]", sep=""))				
1692
          					pb = list(strain1, strain2, chr, b1, e1, b2, e2)					
1693
          					pb
1694
          				} else {
1695
          					NULL
1696
          				}
1697
          			})
1698
          		})
1699
          		return(pbs)
1700
          	}
1701

    
1702
          }, ex=function(){
1703
          	source("src/nucleo_miner/yeast_strain_conversion.R"); 
1704
          	pbs1 = check_overlaping(strain1 = "BY", strain2 = "RM", dest=TRUE)
1705
          	pbs3 = check_overlaping(strain1 = "BY", strain2 = "YJM", dest=TRUE)
1706
          	pbs5 = check_overlaping(strain1 = "RM", strain2 = "YJM", dest=TRUE)
1707
          	pbs2 = check_overlaping(strain1 = "BY", strain2 = "RM", dest=FALSE)
1708
          	pbs4 = check_overlaping(strain1 = "BY", strain2 = "YJM", dest=FALSE)
1709
          	pbs6 = check_overlaping(strain1 = "RM", strain2 = "YJM", dest=FALSE)
1710
          })
1711
          
1712

    
1713
					res = check_overlaping(strain1 = strain1, strain2 = strain2, chr = trans_roi$chr, lower_bound=lower_bound, upper_bound=upper_bound) 
1714
					if (!is.null(res)) {
1715
						df_res = data.frame(matrix(unlist(res), ncol = 7, byrow=TRUE), stringsAsFactors=FALSE)		
1716
						interval = df_res[1,]
1717
						inter_min = as.numeric(max( min(interval$X4, interval$X5), min(interval$X6, interval$X7)))
1718
						inter_max = as.numeric(min( max(interval$X4, interval$X5), max(interval$X6, interval$X7)))
1719
						# print(paste("SPLIT ROI", roi_index, "for", combi[1], combi[2]))			
1720
						new_roi1 = trans_roi
1721
						new_roi2 = trans_roi
1722
						new_roi1$begin = lower_bound
1723
						new_roi1$end = inter_min - 1
1724
						new_roi1$length = new_roi1$end - new_roi1$begin + 1
1725
						new_roi2$begin = inter_max + 1
1726
						new_roi2$end = upper_bound
1727
						new_roi2$length = new_roi2$end - new_roi2$begin + 1
1728
						if (new_roi1$length > min_cur_width) {
1729
							BY_roi  = translate_roi(new_roi1, "BY", config = config)
1730
							tmp_BY_roi = BY_roi
1731
							BY_roi$begin = min(tmp_BY_roi$begin, tmp_BY_roi$end)
1732
							BY_roi$end = max(tmp_BY_roi$begin, tmp_BY_roi$end)
1733
							BY_roi$length = abs(BY_roi$length)
1734
							to_be_check_again = dfadd(to_be_check_again, BY_roi)	
1735
						}
1736
						if (new_roi2$length > min_cur_width) {
1737
							BY_roi  = translate_roi(new_roi2, "BY", config = config)
1738
							tmp_BY_roi = BY_roi
1739
							BY_roi$begin = min(tmp_BY_roi$begin, tmp_BY_roi$end)
1740
							BY_roi$end = max(tmp_BY_roi$begin, tmp_BY_roi$end)
1741
							BY_roi$length = abs(BY_roi$length)
1742
							to_be_check_again = dfadd(to_be_check_again, BY_roi)	
1743
						}
1744
						if (to_be_check_roi_index < length(to_be_check_rois[,1])) {
1745
							for (i in (to_be_check_roi_index + 1):length(to_be_check_rois[,1])) {
1746
								to_be_check_again = dfadd(to_be_check_again, to_be_check_rois[i,])								
1747
							}
1748
						}
1749
						NEED_RERUN = TRUE
1750
						break
1751
					}
1752
				}
1753
				if (NEED_RERUN) {
1754
					to_be_check_rois = to_be_check_again
1755
					break
1756
				}
1757
			}
1758
		}
1759
		
1760
		checked_rois = to_be_check_rois
1761
		for (checked_roi_index in 1:length(checked_rois[,1])) {
1762
			rois_3rd_round = dfadd(rois_3rd_round, checked_rois[checked_roi_index,])	
1763
		}
1764
	}
1765

    
1766

    
1767
	print(length(rois_3rd_round[,1]))
1768
	print(sum(rois_3rd_round$length))
1769
	rois = rois_3rd_round
1770
	
1771
	
1772
	if (plot) {
1773
		print(paste(length(rois$chr), "area of interest."))
1774
	  # Plot rois
1775
	  genome = get_content(config$FASTA_REFERENCE_GENOME_FILES[["BY"]], "fasta")   
1776
		plot(0,0, ylim=(c(1,length(genome))), xlim = c(0, max(apply(t(genome), 2, function(chr){length(unlist(chr))}))))
1777
		for (name in names(genome)) {
1778
			if (TRUE) {
1779
				chr_ref = paste("chr", ARAB2ROM()[[config$FASTA_INDEXES[["BY"]][[name]]]], sep="")
1780
			} else {			
1781
				chr_ref = name
1782
			}
1783
			y_lev = as.integer(config$FASTA_INDEXES[["BY"]][[name]])
1784
			lines(c(0,length(unlist(genome[[name]]))), c(y_lev,y_lev))
1785
			text( length(unlist(genome[[name]]))/2, y_lev, labels = chr_ref)	
1786
		}
1787
	  col=1
1788
	  for (roi_index in 1:length(rois$chr)) {
1789
			roi = rois[roi_index,]	
1790
			y_lev = as.integer(roi$chr) + 0.3
1791
			lines(c(roi$begin,roi$end), c(y_lev,y_lev), col=col)
1792
			text( mean(c(roi$begin,roi$end)), y_lev, labels = roi_index)
1793
	  	col = col + 1
1794
	  }	
1795
	}
1796
	return (rois)
1797
}
1798

    
1799

    
1800

    
1801

    
1802
build_replicates = structure(function(# Stage replicates data 
1803
### This function loads in memory data corresponding to the given experiments.
1804
expe, ##<< a list of vector corresponding to vector of replicates.
1805
roi, ##<< the region that we are interested in.
1806
only_fetch=FALSE, ##<< filter or not inputs.
1807
get_genome=FALSE,##<< Load or not corresponding genome.
1808
all_samples, ##<< Global list of samples.
1809
config=NULL ##<< GLOBAL config variable.
1810
) {
1811
  build_samples = function(samples_ids, roi, only_fetch=FALSE, get_genome=TRUE, get_ouputs=TRUE, all_samples) {
1812
  	samples=list()
1813
  	for (i in samples_ids) {
1814
  		sample = as.list(all_samples[all_samples$id==i,])
1815
      sample$orig_roi = roi
1816
      sample$roi = translate_roi(roi, sample$strain, config = config)
1817
  		if (get_genome) {
1818
  			# Get Genome
1819
  			fasta_ref_filename = config$FASTA_REFERENCE_GENOME_FILES[[sample$strain]]
1820
  			sample$roi$genome = get_content(fasta_ref_filename, "fasta")[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end]		
1821
  		}
1822
  		# Get inputs
1823
  		sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep="")
1824
  		sample$inputs = get_content(sample_inputs_filename, "table", stringsAsFactors=FALSE) 
1825
  		sample$total_reads = sum(sample$inputs[,4]) 	
1826
  		if (!only_fetch) {
1827
  		  sample$inputs = filter_tf_inputs(sample$inputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300, filter_for_coverage=TRUE)
1828
  	  }
1829
  	  # Get TF outputs for Mnase_Seq samples
1830
  		if (sample$marker == "Mnase_Seq" & get_ouputs) {	
1831
  			sample_outputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep="")
1832
  			sample$outputs = get_content(sample_outputs_filename, "table", header=TRUE, sep="\t")
1833
  			if (!only_fetch) {
1834
  	  		sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr,  min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
1835
    		}
1836
  		}
1837
  		samples[[length(samples) + 1]] = sample		
1838
  	}
1839
  	return(samples)
1840
  }
1841
	replicates = list()
1842
	for(samples_ids in expe) {
1843
		samples = build_samples(samples_ids, roi, only_fetch=only_fetch, get_genome=get_genome, all_samples=all_samples)
1844
		replicates[[length(replicates) + 1]] = samples
1845
	}
1846
	return(replicates)
1847
  }, ex = function() {
1848
    # library(rjson)
1849
    # library(nucleominer)
1850
    # 
1851
    # # Read config file
1852
    # json_conf_file = "nucleo_miner_config.json"
1853
    # config = fromJSON(paste(readLines(json_conf_file), collapse=""))
1854
    # # Read sample file
1855
    # all_samples = get_content(config$CSV_SAMPLE_FILE, "cvs", sep=";", head=TRUE, stringsAsFactors=FALSE)  
1856
    # # here are the sample ids in a list
1857
    # expes = list(c(1))
1858
    # # here is the region that we wnt to see the coverage
1859
    # cur = list(chr="8", begin=472000, end=474000, strain_ref="BY") 
1860
    # # it displays the corverage
1861
    # replicates = build_replicates(expes, cur, all_samples=all_samples, config=config)
1862
    # out = watch_samples(replicates, config$READ_LENGTH, 
1863
    #       plot_coverage = TRUE,  
1864
    #       plot_squared_reads = FALSE,  
1865
    #       plot_ref_genome = FALSE, 
1866
    #       plot_arrow_raw_reads = FALSE,  
1867
    #       plot_arrow_nuc_reads = FALSE,  
1868
    #       plot_gaussian_reads = FALSE,  
1869
    #       plot_gaussian_unified_reads = FALSE,  
1870
    #       plot_ellipse_nucs = FALSE,  
1871
    #       plot_wp_nucs = FALSE,  
1872
    #       plot_wp_nuc_model = FALSE,  
1873
    #       plot_common_nucs = FALSE,  
1874
    #       height = 50)
1875
  })
1876

    
1877

    
1878

    
1879

    
1880

    
1881

    
1882

    
1883

    
1884

    
1885

    
1886

    
1887

    
1888

    
1889

    
1890

    
1891

    
1892

    
1893

    
1894

    
1895

    
1896

    
1897

    
1898

    
1899

    
1900

    
1901

    
1902

    
1903

    
1904

    
1905

    
1906

    
1907

    
1908

    
1909

    
1910

    
1911

    
1912

    
1913

    
1914

    
1915

    
1916

    
1917

    
1918

    
1919

    
1920

    
1921

    
1922

    
1923

    
1924

    
1925

    
1926

    
1927

    
1928

    
1929

    
1930

    
1931

    
1932

    
1933

    
1934

    
1935

    
1936

    
1937

    
1938

    
1939

    
1940

    
1941

    
1942

    
1943

    
1944

    
1945

    
1946

    
1947

    
1948

    
1949

    
1950

    
1951

    
1952

    
1953

    
1954

    
1955

    
1956

    
1957

    
1958

    
1959

    
1960

    
1961

    
1962

    
1963

    
1964

    
1965

    
1966

    
1967

    
1968

    
1969

    
1970

    
1971

    
1972

    
1973

    
1974

    
1975

    
1976

    
1977
perform_anovas = function(# Performaing ANOVAs
1978
### Counts reads and Performs ANOVAS for each common nucleosomes involved.
1979
replicates, ##<< Set of replicates, each replicate is a list of samples (ideally 3). Each sample is a list like \emph{sample = list(id=..., marker=..., strain=..., roi=..., inputs=..., outputs=...)} with \emph{roi = list(name=..., begin=...,  end=..., chr=..., genome=...)}. In the \emph{perform_anovas} contexte, we need 4 replicates (4 * (3 samples)): 2 strains * (1 marker + 1 input (Mnase_Seq)). 
1980
aligned_inter_strain_nucs, ##<< List of common nucleosomes.
1981
inputs_name="Mnase_Seq", ##<< Name of the input.
1982
plot_anova_boxes=FALSE ##<< Plot (or not) boxplot for each nuc.
1983
) {
1984
	anova_results = NULL
1985
	for (inter_strain_nuc_index in 1:length(aligned_inter_strain_nucs[,1])) {
1986
		inter_strain_nuc = aligned_inter_strain_nucs[inter_strain_nuc_index,]
1987
	  # counting reads
1988
		my_data = NULL
1989
		for (replicate_rank in 1:length(replicates)) {
1990
			samples = replicates[[replicate_rank]]
1991
			strain = samples[[1]]$strain
1992
			marker = samples[[1]]$marker
1993
			for (sample_rank in 1:length(samples)) {
1994
				sample = samples[[sample_rank]]
1995
				nuc_width = as.integer(inter_strain_nuc[[paste("upper_bound_",strain,sep="")]]) - inter_strain_nuc[[paste("lower_bound_",strain,sep="")]]
1996
				nuc_reads = filter_tf_inputs(sample$inputs, inter_strain_nuc[[paste("chr_",strain,sep="")]], inter_strain_nuc[[paste("lower_bound_",strain,sep="")]], inter_strain_nuc[[paste("upper_bound_",strain,sep="")]], nuc_width - 1)
1997
				indic = sum(nuc_reads[,4]) * 1000000/sample$total_reads / nuc_width 
1998
				my_data = dfadd(my_data, list(strain = strain, marker = marker, indic = indic))
1999
			}
2000
		}
2001
		my_data$strain = as.factor(my_data$strain)
2002
		my_data$marker = as.factor(my_data$marker)
2003
	
2004
		strain_ref1 = replicates[[1]][[1]]$strain
2005
		strain_ref2 = replicates[[2]][[1]]$strain 
2006
		marker = replicates[[length(replicates)]][[1]]$marker				
2007

    
2008
	  # Collecting anova results
2009
	  anova_result = list()
2010
		# nucs info
2011
		for (name in names(inter_strain_nuc)) {
2012
			anova_result[[name]] = inter_strain_nuc[[name]]
2013
		}
2014
	  #
2015
		for (strain in c(strain_ref1, strain_ref2)) {
2016
			for (manip in c(inputs_name, marker)) {
2017
				replicat = 1
2018
				for(indic in my_data[my_data$strain == strain & my_data$marker == manip,]$indic) {
2019
					anova_result[[paste(strain, manip, replicat, sep="_")]] = indic
2020
					replicat = replicat + 1
2021
				}			
2022
			}	
2023
		}
2024

    
2025
	  # Compute ANOVAs
2026
		mnase_data = my_data[my_data$marker == inputs_name,]
2027
		mnase_aov = aov(indic~strain,mnase_data)
2028
		mnase_effects = model.tables(mnase_aov)$tables
2029
		mnase_pvalues = summary(mnase_aov)[[1]]$Pr
2030
		# boxplot(indic~ma*st,mnase_data)
2031
		anova_result[["mnase_st"]] = mnase_effects$strain[1]
2032
		anova_result[["mnase_st_pvalue"]] = mnase_pvalues[1]	
2033
		# 
2034
		marker_data = my_data[my_data$marker == marker,]
2035
		marker_aov = aov(indic~strain,marker_data)
2036
		marker_effects = model.tables(marker_aov)$tables
2037
		marker_pvalues = summary(marker_aov)[[1]]$Pr
2038
		# boxplot(indic~ma*st,marker_data)
2039
		anova_result[["marker_st"]] = marker_effects$strain[1]
2040
		anova_result[["marker_st_pvalue"]] = marker_pvalues[1]
2041
		# 
2042
		st1_data = my_data[my_data$strain == strain_ref1,]
2043
		st1_aov = aov(indic~marker,st1_data)
2044
		st1_effects = model.tables(st1_aov)$tables
2045
		st1_pvalues = summary(st1_aov)[[1]]$Pr
2046
		# boxplot(indic~ma*st,st1_data)
2047
		anova_result[["st1_ma"]]=st1_effects$ma[inputs_name]
2048
		anova_result[["st1_ma_pvalue"]]=st1_pvalues[1]
2049
		# 
2050
		st2_data = my_data[my_data$strain == strain_ref2,]
2051
		st2_aov = aov(indic~marker,st2_data)
2052
		st2_effects = model.tables(st2_aov)$tables
2053
		st2_pvalues = summary(st2_aov)[[1]]$Pr
2054
		# boxplot(indic~ma*st,st2_data)
2055
		anova_result[["st2_ma"]]=st2_effects$ma[inputs_name]
2056
		anova_result[["st2_ma_pvalue"]]=st2_pvalues[1]
2057
		# 
2058
		correl_data = my_data
2059
		correl_aov = aov(indic~strain*marker,correl_data)
2060
		correl_effects = model.tables(correl_aov)$tables
2061
		correl_pvalues = summary(correl_aov)[[1]]$Pr
2062
		if (plot_anova_boxes) {
2063
			x11()
2064
			boxplot(indic~marker*strain,correl_data)
2065
		}
2066
		anova_result[["correl_st"]]=correl_effects$strain[1]
2067
		anova_result[["correl_ma"]]=correl_effects$ma[inputs_name]
2068
		anova_result[["correl_st_ma"]]=correl_effects$"strain:marker"[1,1]
2069
		anova_result[["correl_st_pvalue"]]=correl_pvalues[1]
2070
		anova_result[["correl_ma_pvalue"]]=correl_pvalues[2]
2071
		anova_result[["correl_st_ma_pvalue"]]=correl_pvalues[3]
2072
		
2073
		anova_results = dfadd(anova_results, anova_result)
2074
	}
2075
	return(anova_results)
2076
### Returns ANOVA results and comunted reads.
2077
}
2078

    
2079

    
2080
watch_samples = function(# Watching analysis of samples 
2081
### This function allows to view analysis for a particuler region of the genome.
2082
replicates, ##<< replicates under the form...
2083
read_length, ##<< length of the reads
2084
plot_ref_genome = TRUE, ##<< Plot (or not) reference genome.
2085
plot_arrow_raw_reads = TRUE,  ##<< Plot (or not) arrows for raw reads.
2086
plot_arrow_nuc_reads = TRUE,  ##<< Plot (or not) arrows for reads aasiocied to a nucleosome.
2087
plot_squared_reads = TRUE,  ##<< Plot (or not) reads in the square fashion.
2088
plot_coverage = FALSE,  ##<< Plot (or not) reads in the covergae fashion. fashion.
2089
plot_gaussian_reads = TRUE,  ##<< Plot (or not) gaussian model of a F anf R reads.
2090
plot_gaussian_unified_reads = TRUE,  ##<< Plot (or not) gaussian model of a nuc.
2091
plot_ellipse_nucs = TRUE,  ##<< Plot (or not) ellipse for a nuc.
2092
change_col = TRUE, ##<< Change the color of each nucleosome.
2093
plot_wp_nucs = TRUE,  ##<< Plot (or not) cluster of nucs
2094
plot_wp_nuc_model = TRUE,  ##<< Plot (or not) gaussian model for a cluster of nucs
2095
plot_common_nucs = TRUE,  ##<< Plot (or not) aligned reads.
2096
plot_anovas = FALSE,  ##<< Plot (or not) scatter for each nuc.
2097
plot_anova_boxes =  FALSE,  ##<< Plot (or not) boxplot for each nuc.
2098
plot_wp_nucs_4_nonmnase = FALSE,  ##<< Plot (or not) clusters for non inputs samples.
2099
plot_chain = FALSE,  ##<< Plot (or not) clusterised nuceosomes between mnase samples.
2100
aggregated_intra_strain_nucs = NULL, ##<< list of aggregated intra strain nucs. If NULL, it will be computed. 
2101
aligned_inter_strain_nucs = NULL, ##<< list of aligned inter strain nucs. If NULL, it will be computed.
2102
height = 10, ##<< Number of reads in per million read for each sample, graphical parametre for the y axis.
2103
config=NULL ##<< GLOBAL config variable
2104
){
2105
  returned_list = list()
2106
  # Computing global display parameters
2107
  if (replicates[[1]][[1]]$roi[["begin"]] < replicates[[1]][[1]]$roi[["end"]]) {
2108
	  x_min_glo = replicates[[1]][[1]]$roi[["begin"]]
2109
	  x_max_glo = replicates[[1]][[1]]$roi[["end"]]
2110
  } else {
2111
	  x_min_glo = - replicates[[1]][[1]]$roi[["begin"]]
2112
	  x_max_glo = - replicates[[1]][[1]]$roi[["end"]]
2113
  }
2114
	base_glo = 0
2115
	nb_rank_glo = 0
2116
  for (samples in replicates) {
2117
  	nb_rank_glo = nb_rank_glo + length(samples)
2118
  }
2119
	ylim_glo = c(base_glo, base_glo + height * nb_rank_glo)
2120
	y_min_glo = min(ylim_glo)
2121
	y_max_glo = max(ylim_glo)
2122
  delta_y_glo = y_max_glo - y_min_glo
2123
  # Plot main frame
2124
  plot(c(x_min_glo,x_max_glo), c(0,0), ylim=ylim_glo, col=0, yaxt="n", ylab="#reads (per million reads)", xlab=paste("Ref strain:", replicates[[1]][[1]]$strain, "chr: ", replicates[[1]][[1]]$roi$chr), main="NucleoMiner2" )
2125
  axis(2, at=0:(nb_rank_glo*2) * delta_y_glo / (nb_rank_glo*2), labels=c(rep(c(height/2,0),nb_rank_glo),height/2))
2126
  # Go
2127
	replicates_wp_nucs = list()
2128
  for (replicate_rank in 1:length(replicates)) {
2129
		# Computing replicate parameters
2130
		nb_rank = length(samples)
2131
		base = (replicate_rank-1) * height * nb_rank
2132
		ylim = c(base, base + height * nb_rank)
2133
		y_min = min(ylim)
2134
		y_max = max(ylim)
2135
	  delta_y = y_max - y_min
2136
		samples = replicates[[replicate_rank]]
2137
		for (sample_rank in 1:length(samples)) {
2138
			# computing sample parameters
2139
			sample = samples[[sample_rank]]
2140
			y_lev = y_min + (sample_rank - 0.5) * delta_y/nb_rank
2141
			text(x_min_glo, y_lev + height/2 - delta_y_glo/100, labels=paste("(",sample$id,") ",sample$strain, " ", sample$marker, sep=""))
2142

    
2143
		  if (sample$roi[["begin"]] < sample$roi[["end"]]) {
2144
			  x_min = sample$roi[["begin"]]
2145
			  x_max = sample$roi[["end"]]
2146
		  } else {
2147
			  x_min = - sample$roi[["begin"]]
2148
			  x_max = - sample$roi[["end"]]
2149
		  }		
2150
			shift = x_min_glo - x_min
2151
	    # Plot Genome seq
2152
			if (plot_ref_genome) {
2153
		  	text(1:length(sample$roi$genome) + x_min - 1 + shift, rep(y_lev - height/2, length(sample$roi$genome)), labels=toupper(sample$roi$genome), cex=dev.size()[1]*9/(x_max-x_min), family="Courier")
2154
		  }
2155
			# Plot reads
2156
			reads = sample$inputs
2157
			signs = sign_from_strand(reads[,3])	
2158
			if (plot_arrow_raw_reads) {
2159
				arrows(sign(x_min) * reads[,2] + shift, sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads + y_lev, sign(x_min) * (reads[,2] + signs * read_length) + shift, sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads + y_lev, 
2160
				col=1, length=0.15/nb_rank)          	
2161
			}
2162
	    if (plot_squared_reads) {    	
2163
        # require(plotrix)
2164
				rect(sign(x_min) * reads[,2] + shift, rep(y_lev,length(reads[,1])), sign(x_min) * (reads[,2] + signs * read_length) + shift,  y_lev + sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads, col=adjustcolor(1, alpha.f = 0.1),border=0)
2165
			}
2166
	    if (plot_coverage) {    	
2167
        if (length(reads[,1]) != 0) {
2168
          step_h = sign(x_min) * signs * reads[,4]
2169
          step_b = sign(x_min) * reads[,2] + shift
2170
          step_e = sign(x_min) * (reads[,2] + signs * 150) + shift
2171
          steps_x = min(step_b, step_e):max(step_b, step_e)          
2172
          steps_y = rep(0, length(steps_x))
2173
          for (i in 1:length(step_h)) {          
2174
            steps_y[which(steps_x==min(step_b[i], step_e[i]))] =  steps_y[which(steps_x==min(step_b[i], step_e[i]))] + abs(step_h[i])
2175
            steps_y[which(steps_x==max(step_b[i], step_e[i]))] =  steps_y[which(steps_x==max(step_b[i], step_e[i]))] - abs(step_h[i])
2176
          }
2177
          tmp_index = which(steps_y != 0)
2178
          steps_x = steps_x[tmp_index]
2179
          steps_y = steps_y[tmp_index]
2180
          tmp_current_level = 0
2181
          for (i in 1:length(steps_y)) {          
2182
            steps_y[i] = tmp_current_level + steps_y[i]
2183
            tmp_current_level = steps_y[i]
2184
          }
2185
          steps_y = c(0, steps_y)
2186
          steps_y = steps_y * 1000000/sample$total_reads
2187
        } else {
2188
          steps_y = c(0, 0, 0)
2189
          steps_x = c(x_min, x_max)
2190
        }
2191
        # print(steps_x)
2192
        # print(steps_y)
2193
        lines(stepfun(steps_x, steps_y + y_lev), pch="")
2194
        abline(y_lev,0)
2195
        returned_list[[paste("cov", sample$id, sep="_")]] = stepfun(steps_x, steps_y)
2196
			}
2197
			# Plot nucs
2198
	    if (sample$marker == "Mnase_Seq" & (plot_squared_reads | plot_gaussian_reads | plot_gaussian_unified_reads | plot_arrow_nuc_reads)) {
2199
				nucs = sample$outputs
2200
				if (length(nucs$center) > 0) {
2201
					col = 1
2202
		      for (i in 1:length(nucs$center)) {
2203
            if (change_col) {
2204
  						col = col + 1              
2205
            }
2206
		        nuc = nucs[i,]
2207
						involved_reads = filter_tf_inputs(reads, sample$roi$chr, nuc$lower_bound, nuc$upper_bound, nuc_width = nuc$width)
2208
				  	involved_signs = apply(t(involved_reads[,3]), 2, function(strand) {	if (strand == "F") return(1) else return(-1)})
2209
						total_involved_reads = sum(involved_reads[,4]) 
2210
						if (plot_arrow_nuc_reads ) {
2211
							arrows(sign(x_min) * involved_reads[,2] + shift, sign(x_min) * involved_signs * involved_reads[,4] * 1000000/sample$total_reads + y_lev, sign(x_min) * (involved_reads[,2] + involved_signs * read_length) + shift, sign(x_min) * involved_signs * involved_reads[,4] * 1000000/sample$total_reads + y_lev, 
2212
							col=col, length=0.15/nb_rank)          	
2213
						}
2214
	          if (plot_gaussian_reads | plot_gaussian_unified_reads) {
2215
  						flatted_reads = flat_reads(involved_reads, nuc$width)
2216
	  					delta_x = (nuc$center - nuc$width):(nuc$center + nuc$width)
2217
		  			}
2218
	          if (plot_gaussian_reads ) {
2219
							flatted_reads = flat_reads(involved_reads, nuc$width)
2220
							delta_x = (nuc$center - nuc$width):(nuc$center + nuc$width)
2221
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[1]]), sd(flatted_reads[[1]])) * length(flatted_reads[[1]]) * sign(x_min) * height/5 + y_lev, col=col)
2222
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[2]]), sd(flatted_reads[[2]])) * length(flatted_reads[[2]]) * -1 * sign(x_min) * height/5 + y_lev, col=col)
2223
	          }
2224
	          if (plot_gaussian_unified_reads ) {
2225
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[3]]), sd(flatted_reads[[3]])) * length(flatted_reads[[3]]) * height/5 + y_lev, col=col, lty=2)
2226
	          }
2227
	          if (plot_ellipse_nucs) {
2228
				      # require(plotrix)
2229
	  	 				draw.ellipse(sign(x_min) * nuc$center + shift, y_lev, nuc$width/2, total_involved_reads/nuc$width * height/5, border=col) 
2230
						}
2231
		      }
2232
		    } else {
2233
		      print("WARNING! No nucs to print.")
2234
		    }
2235
			}
2236
	  }
2237
    
2238
	  # Plot wp nucs
2239
		if ((plot_wp_nucs_4_nonmnase | sample$marker == "Mnase_Seq") & (plot_wp_nucs | plot_common_nucs | plot_chain)) {
2240
			if (samples[[1]]$marker == "Mnase_Seq") {
2241
				if (is.null(aggregated_intra_strain_nucs)) {      
2242
	  			wp_nucs = aggregate_intra_strain_nucs(samples)[[1]]			
2243
				} else {
2244
					wp_nucs = aggregated_intra_strain_nucs[[replicate_rank]]
2245
				}
2246
		  } else {
2247
  			wp_nucs = replicates_wp_nucs[[replicate_rank-2]]
2248
		  }
2249
      if (plot_chain) {
2250
        tf_nucs = lapply(wp_nucs, function(nuc) {      
2251
          bar = apply(t(nuc$nucs), 2, function(tmp_nuc){
2252
            tmp_nuc = tmp_nuc[[1]]
2253
            tmp_nuc$inputs = NULL
2254
            tmp_nuc$original_reads = NULL
2255
            tmp_nuc$wp = nuc$wp
2256
            # print(tmp_nuc)
2257
            return(tmp_nuc)
2258
          }) 
2259
          return(do.call(rbind, bar))
2260
        })
2261
        tf_nucs = data.frame(do.call(rbind, tf_nucs))
2262
        tmp_x = (unlist(tf_nucs$lower_bound) + unlist(tf_nucs$upper_bound)) / 2        
2263
        tmp_y =  y_min + (unlist(tf_nucs$track) - 0.5) * delta_y/nb_rank
2264
        tmp_y_prev = tmp_y[-length(tmp_y)]
2265
        tmp_y_next = tmp_y[-1]
2266
        tmp_y_inter = (tmp_y_prev + tmp_y_next) / 2
2267
        tmp_track = unlist(tf_nucs$track)
2268
        tmp_track_prev = tmp_track[-length(tmp_track)]
2269
        tmp_track_next = tmp_track[-1]
2270
        tmp_track_inter = signif(tmp_track_prev - tmp_track_next) * (abs(tmp_track_prev - tmp_track_next) > 1) * 25
2271
        tmp_x_prev = tmp_x[-length(tmp_x)]
2272
        tmp_x_next = tmp_x[-1]
2273
        need_shift = apply(t(tmp_x_next - tmp_x_prev), 2, function(delta){ delta < 50})
2274
        tmp_x_inter = (tmp_x_prev + tmp_x_next) / 2 + tmp_track_inter * need_shift
2275
        tmp_lod_inter =signif(unlist(tf_nucs$lod_score)[-1], 2)
2276
        new_tmp_x = c()               
2277
        new_tmp_y = c()              
2278
        index_odd = 1:length(tmp_x) * 2 - 1
2279
        index_even = (1:(length(tmp_x) - 1)) * 2 
2280
        new_tmp_x[index_odd] = tmp_x  
2281
        new_tmp_y[index_odd] = tmp_y
2282
        new_tmp_x[index_even] = tmp_x_inter
2283
        new_tmp_y[index_even] = tmp_y_inter
2284
        lines(new_tmp_x , new_tmp_y, lw=2)
2285
        points(tmp_x, tmp_y, cex=4, pch=16, col="white")
2286
        points(tmp_x, tmp_y, cex=4, lw=2)
2287
        text(tmp_x, tmp_y, 1:nrow(tf_nucs))
2288
        text(tmp_x_inter, tmp_y_inter, tmp_lod_inter, srt=90, cex=0.9, bg = "yellow")#, col=(tmp_lod_inter < 20) + 2)  
2289
      }
2290
  
2291
      if (plot_wp_nucs | plot_common_nucs ) {
2292
    		replicates_wp_nucs[[replicate_rank]] = wp_nucs
2293
  			for (wp_nuc in wp_nucs) {
2294
  				if (wp_nuc$wp){
2295
  					rect(sign(x_min) * wp_nuc$lower_bound + shift, y_min, sign(x_min) * wp_nuc$upper_bound + shift, y_max, col=adjustcolor(2, alpha.f = 0.1), border=1)
2296
  					all_original_reads = c()
2297
  					for(initial_nuc in wp_nuc$nucs) {
2298
  						all_original_reads = c(all_original_reads, initial_nuc$original_reads)						
2299
  					}
2300
  					delta_x = wp_nuc$lower_bound:wp_nuc$upper_bound
2301
  					if (FALSE) {
2302
  					  rect(sign(x_min) * wp_nuc$lower_bound + shift, y_min, sign(x_min) * wp_nuc$upper_bound + shift, y_max, col="#EEEEEE", border=1)
2303
  				  }
2304
  					if (plot_wp_nuc_model) {
2305
  					  lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(all_original_reads), sd(all_original_reads)) * length(all_original_reads) * height/5 + y_min, col=1)				
2306
  				  }
2307
  				}
2308
  			}
2309
      }
2310
		}
2311
	}	
2312
  
2313
	if (plot_common_nucs | plot_anovas | plot_anova_boxes) {
2314
		if (is.null(aligned_inter_strain_nucs)) {
2315
			aligned_inter_strain_nucs = align_inter_strain_nucs(replicates, replicates_wp_nucs[[1]], replicates_wp_nucs[[2]], config=config)[[1]]
2316
		}
2317
		if (plot_common_nucs) {
2318
      #Plot common wp nucs
2319
      mid_y = shift = x_min = x_max = nb_rank = base = ylim = ymin = y_max = delta_y = list()
2320
            for (replicate_rank in 1:length(replicates)) {
2321
        nb_rank[[replicate_rank]] = length(samples)
2322
        base[[replicate_rank]] = (replicate_rank-1) * height * nb_rank[[replicate_rank]]
2323
        ylim[[replicate_rank]] = c(base[[replicate_rank]], base[[replicate_rank]] + height * nb_rank[[replicate_rank]])
2324
        y_min[[replicate_rank]] = min(ylim[[replicate_rank]])
2325
        y_max[[replicate_rank]] = max(ylim[[replicate_rank]])
2326
        delta_y[[replicate_rank]] = y_max[[replicate_rank]] - y_min[[replicate_rank]]
2327
        mid_y[[replicate_rank]] = (y_max[[replicate_rank]] + y_min[[replicate_rank]]) / 2
2328
      
2329
        samples = replicates[[replicate_rank]]
2330
        for (sample_rank in 1:length(samples)) {
2331
          sample = samples[[sample_rank]]
2332
          y_lev = y_min[[replicate_rank]] + (sample_rank - 0.5) * delta_y[[replicate_rank]]/nb_rank[[replicate_rank]]
2333
          if (sample$roi[["begin"]] < sample$roi[["end"]]) {
2334
            x_min[[replicate_rank]] = sample$roi[["begin"]]
2335
            x_max[[replicate_rank]] = sample$roi[["end"]]
2336
          } else {
2337
            x_min[[replicate_rank]] = - sample$roi[["begin"]]
2338
            x_max[[replicate_rank]] = - sample$roi[["end"]]
2339
          }
2340
          shift[[replicate_rank]] = x_min[[1]] - x_min[[replicate_rank]]
2341
        }
2342
      }  
2343
      for (inter_strain_nuc_index in 1:length(aligned_inter_strain_nucs[,1])) {
2344
        inter_strain_nuc = aligned_inter_strain_nucs[inter_strain_nuc_index,]
2345
        tmp_xs = tmp_ys = c()
2346
        for (replicate_rank in 1:length(replicates)) {
2347
          samples = replicates[[replicate_rank]]
2348
          strain = samples[[1]]$strain
2349
          tmp_xs = c(tmp_xs, sign(x_min[[replicate_rank]]) * (inter_strain_nuc[[paste("lower_bound_",strain,sep="")]] + inter_strain_nuc[[paste("upper_bound_",strain,sep="")]])/2 + shift[[replicate_rank]])
2350
          tmp_ys = c(tmp_ys, mid_y[[replicate_rank]])
2351
        }
2352
        lines(tmp_xs, tmp_ys, col=2, type="b", lwd=dev.size()[1]*100/(x_max[[1]]-x_min[[1]]), cex=dev.size()[1]*200/(x_max[[1]]-x_min[[1]]), pch=19)          
2353
      }    
2354
		}
2355
		if (plot_anovas | plot_anova_boxes) {
2356
			anova_results = perform_anovas(replicates, aligned_inter_strain_nucs, plot_anova_boxes=plot_anova_boxes)
2357
			thres = FDR(anova_results[,"correl_st_ma_pvalue"],0.05)
2358
			if (is.na(thres)) {
2359
				# Boneferroni multiple test
2360
				thres = 0.05/length(anova_results[,1])
2361
			}
2362
			filtred_anova_results = anova_results[anova_results[,"correl_st_ma_pvalue"]<thres,]
2363
			x11()
2364
			if (plot_anovas) {
2365
			  plot(anova_results[,"mnase_st"],anova_results[,"correl_st_ma"], pch=1, xlim=c(-0.07,0.07), ylim=c(-0.07,0.07),main="SNEPs" ,xlab = "strain effect", ylab = "snep effect")
2366
	      points(x=filtred_anova_results[,"mnase_st"],y=filtred_anova_results[,"correl_st_ma"],col=2, pch="+")
2367
		  }
2368
 	  }
2369
	}
2370
  return(returned_list)
2371
}
2372

    
2373

    
2374

    
2375