Statistiques
| Branche: | Révision :

root / src / R / nucleominer.R @ 780f632a

Historique | Voir | Annoter | Télécharger (104,05 ko)

1
count_reads_cur = memoise(function(
2
marker,
3
combi,
4
form,
5
cur_index,
6
all_samples, ##<< A table that describe all our samples.
7
config=NULL ##<< GLOBAL config variable.
8
) {
9
	print(paste("Counting reads for ", form, " CUR ", cur_index, " in ", marker, " for [", combi[1], ",", combi[2], "].", sep=""))
10
	nucs = list()
11
	for (strain in combi) {
12
    if (form == "unr") {
13
  	  nucs[[strain]] = mread.table(paste(config$RESULTS_DIR, "/", strain, "_in_", combi[1], "_", combi[2], "_unr.tab", sep=""), header=TRUE)
14
    } else {
15
	    nucs[[strain]] = mread.table(paste(config$RESULTS_DIR, "/", strain, "_wp.tab", sep=""), header=TRUE)
16
    }
17
	}
18
	common_nucs = mread.table(paste(config$RESULTS_DIR, "/", combi[1], "_", combi[2], "_common_", form, ".tab", sep=""), header=TRUE)    
19
  if (form == "wp") {
20
    common_nucs = common_nucs[common_nucs$llr_score < 100, ]
21
  }
22
	common_nucs = common_nucs[ common_nucs$cur_index == cur_index, ]
23
	if ( length(common_nucs[,1]) != 0) {
24
		all_res = apply(t(1:length(common_nucs[,1])), 2, function(i){
25
    	if (i %% 10 == 1) {
26
    	  print(paste(i, "/", length(common_nucs[,1]), sep=""))	
27
      }
28
    	common_nuc = common_nucs[i,]		
29
    	res= list()
30
      # Get nuc infos
31
      for (strain in combi) {
32
    	  tmp_nuc_details =  nucs[[strain]][nucs[[strain]]$cur_index == common_nuc$cur_index & nucs[[strain]]$index_nuc == common_nuc[[paste("index_nuc",strain, sep="_")]], ]
33
        # print(tmp_nuc_details)
34
    		res[[paste("chr", strain, sep="_")]] = tmp_nuc_details$chr
35
    		res[[paste("lower_bound", strain, sep="_")]] = tmp_nuc_details$lower_bound
36
    		res[[paste("upper_bound", strain, sep="_")]] = tmp_nuc_details$upper_bound
37
    		res[[paste("index_nuc", strain, sep="_")]] = common_nuc[[paste("index_nuc",strain, sep="_")]]
38
      }	
39
    	res$cur_index =	common_nuc$cur_index
40
      # Get normalized reads
41
    	manip = marker
42
    	{			
43
    		for (strain in combi) {				
44
    			for(sample_id in all_samples[all_samples$marker == manip & all_samples$strain == strain, ]$id) {
45
            # tmp_filename = paste(config$ALIGN_DIR, "/TF/splited/sample_", sample, "_chr_", res[[paste("chr", strain, sep="_")]], "_splited_sample.tab.gz",sep="")
46
            if ("tf_input" %in% names(all_samples)) {
47
              sample_inputs_filename = all_samples$tf_input[all_samples$id==sample_id]
48
            } else {
49
              sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", sample_id, "_TF.txt", sep="")
50
            }
51
    				tmp_unnorm_reads = mread.table(sample_inputs_filename, stringsAsFactors=FALSE)					
52
            names(tmp_unnorm_reads) = c("chr", "pos", "strand", "nb_reads")
53
    				orig_cur = list(name = "foo", 
54
                                begin = res[[paste("lower_bound", strain, sep="_")]], 
55
    														end = res[[paste("upper_bound", strain, sep="_")]], 
56
    														chr = res[[paste("chr", strain, sep="_")]], 
57
    														strain_ref = strain)
58
    				orig_cur$length = orig_cur$end - orig_cur$begin + 1
59
    				tmp_nuc_reads = filter_tf_inputs(tmp_unnorm_reads, orig_cur$chr, orig_cur$begin, orig_cur$end, orig_cur$length) 
60
    				res[[paste(strain, manip, sample_id, sep="_")]] = sum(tmp_nuc_reads[,4])
61
    			}
62
    		}
63
    	}
64
    	res
65
    })   
66
		return(all_res)
67
	} else {
68
		return(NULL)
69
	}
70
})
71

    
72
build_count_table = function(# Build count table for a set of samples.
73
### This function build a count table for a set of sample.
74
marker, ##<< The marker that we want to build the count table.
75
combi, ##<< The combinations of strains that we want to build the count table.
76
form, ##<< The nucleosome that we want to observe: "wp" for sel;l position and "unr" for UNR.
77
curs, ##<< The list of CURs
78
all_samples, ##<< A table that describe all our samples.
79
config=NULL ##<< GLOBAL config variable.
80
) {
81
  dir.create(config$RESULTS_DIR, recursive=TRUE, showWarnings=FALSE)
82
	all_res = apply(t(1:nrow(curs)), 2, function(cur_index) {
83
    res = count_reads_cur(marker=marker, combi=combi, form=form, cur_index=cur_index, all_samples, config)
84
		return(res)
85
	})
86
	vec_names = names(all_res[[1]][[1]])
87
	all_res = data.frame(matrix(unlist(all_res), ncol= length(vec_names), byrow=TRUE))
88
  names(all_res) = vec_names
89
	out_filename = paste(config$RESULTS_DIR, "/", combi[1], "_", combi[2], "_", marker, "_", form, "_and_nbreads.tab",sep="")
90
	write.table(all_res, file=out_filename, row.names=FALSE, quote=FALSE)				
91
}
92

    
93

    
94

    
95

    
96

    
97

    
98
extract_maps = memoise(function(# Extract maps from TemplateFilter  outputs
99
### This function extracts from TemplateFilter outputs for a given CUR./ This is from there that aggregate_intra_strain_nucs and align_inter_strain_nucs fucntions are calles. This is an internal fucntion
100
cur_index, ##<< The region of interest index.
101
strains, ##<< The strains for which we want to extract intra strain information.
102
combis, ##<< All combinations of strains for which we want to extract inter strain information.
103
all_samples, ##<< A table that describe all our samples.
104
config=NULL##<< GLOBAL config variable.
105
) {
106
  cur = as.list(curs[cur_index,])
107
  print(paste("cur length", cur$length))
108

    
109
  mnase_replicates = list()
110
  aggregated_intra_strain_nucs = list()
111
  for (strain in strains) {
112
    mnase_replicates[[strain]] = fetch_mnase_replicates(strain, as.list(curs[cur_index,]), all_samples, config)
113
    aggregated_intra_strain_nucs[[strain]] = aggregate_intra_strain_nucs(mnase_replicates[[strain]])
114
  }
115

    
116
  aligned_inter_strain_nucs = list()
117
  for(combi in combis) {
118
    if (!(length(aggregated_intra_strain_nucs[[combi[1]]][[1]]) == 0 | length(aggregated_intra_strain_nucs[[combi[2]]][[1]]) == 0 )) {
119
      aligned_inter_strain_nucs[[paste(combi[1], combi[2], sep="_")]] = align_inter_strain_nucs(replicates = list(mnase_replicates[[combi[1]]], mnase_replicates[[combi[2]]]),
120
                                                                                                wp_nucs_strain_ref1 = aggregated_intra_strain_nucs[[combi[1]]][[1]],
121
                                                                                                wp_nucs_strain_ref2 = aggregated_intra_strain_nucs[[combi[2]]][[1]], config = config)
122
    } else {
123
      print("WARNING! no aggregated_intra_strain_nucs.")
124
      aligned_inter_strain_nucs[[paste(combi[1], combi[2], sep="_")]] = NULL
125
    }
126
  }
127

    
128
  return(list(aggregated_intra_strain_nucs, aligned_inter_strain_nucs))
129
})
130

    
131

    
132

    
133
build_maps = function(# Extract maps from TemplateFilter  outputs
134
### This function extracts from TemplateFilter outputs./ This is from there that aggregate_intra_strain_nucs and align_inter_strain_nucs fucntions are calles. This fucntion write well positionned, fuzzy and both maps in the config$RESULTS_DIR directory.
135
strains, ##<< The strains for which we want to extract intra strain information.
136
combis, ##<< The combinations of strains for which we want to extract inter strain information.
137
all_samples, ##<< A table that describe all our samples.
138
curs, ##<< The list of CURs
139
config=NULL##<< GLOBAL config variable.
140
) {
141
  dir.create(config$RESULTS_DIR, recursive=TRUE, showWarnings=FALSE)
142
  ### build wp maps
143
	glo_results = apply(t(1:1:nrow(curs)), 2, function(cur_index) {
144
  	dyad_shift = wp_llr = strain_maps = common_nuc_results = intra_llrs = inter_llrs = list()
145
		print(paste("Collecting maps", cur_index , "/", nrow(curs)))
146
		extracted_maps = extract_maps(cur_index, strains, combis, all_samples, config)
147

    
148
		aggregated_intra_strain_nucs = extracted_maps[[1]]
149
		for (strain in strains) {
150
			print(paste("Collecting maps for", strain, "..."))
151
			partial_strain_maps = aggregated_intra_strain_nucs[[strain]][[1]]
152
      print(strain)
153
      nb_tracks = sum(all_samples$strain == strain & all_samples$marker == "Mnase_Seq")
154
      
155
      tmp_nuc_map = flat_aggregated_intra_strain_nucs(partial_strain_maps, cur_index, nb_tracks)
156

    
157
      wp_indexes = which(tmp_nuc_map$wp == 1)
158
      tmp_wp_llr = apply(t(wp_indexes), 2, function(wp_index){
159
        tmp_wp = partial_strain_maps[[wp_index]]
160
        l = lapply(tmp_wp$nucs, function(tmp_wp_nucs){
161
          tmp_wp_nucs$original_reads
162
        })  
163
        res = llr_score_nvecs(l)
164
        return(res)
165
      })
166

    
167
      tmp_dyad_shift = apply(t(wp_indexes), 2, function(wp_index){
168
        tmp_wp = partial_strain_maps[[wp_index]]  
169
        c = sapply(tmp_wp$nucs, function(tmp_wp_nucs){
170
          tmp_wp_nucs$center
171
        })  
172
        ds = max(c) - min(c)
173
        return(ds)
174
      })
175

    
176
      nb_rep = length(partial_strain_maps[[wp_indexes[1]]]$nucs)
177

    
178
      tmp_nuc_map$wp_llr = rep(NA,nrow(tmp_nuc_map))
179
      tmp_nuc_map[which(tmp_nuc_map$wp == 1),]$wp_llr = signif(tmp_wp_llr,5)
180
      tmp_nuc_map$wp_pval = signif(1-pchisq(2*tmp_nuc_map$wp_llr, df=(nb_rep * 2) - 2),5)
181
      tmp_nuc_map$dyad_shift = rep(NA,nrow(tmp_nuc_map))
182
      tmp_nuc_map[which(tmp_nuc_map$wp == 1),]$dyad_shift = tmp_dyad_shift
183
      
184
      strain_maps[[strain]] = tmp_nuc_map
185
      intra_llrs[[strain]] = aggregated_intra_strain_nucs[[strain]][[2]]
186
		}
187

    
188
    for(combi in combis) {
189
      print(paste("Collecting results for", combi[1], combi[2], "..."))
190
      partial_common_nuc_results = extracted_maps[[2]][[paste(combi[1], combi[2], sep="_")]][[1]]
191
      if (length(partial_common_nuc_results) > 0) {
192
        partial_common_nuc_results$cur_index = rep(cur_index, length(partial_common_nuc_results[,1]))
193
        # partial_common_nuc_results$common_wp_pval = signif(1-pchisq(2*partial_common_nuc_results$llr_score, df=2),5)
194
        common_nuc_results[[paste(combi[1], combi[2], sep="_")]] = partial_common_nuc_results
195
      } else {
196
        print(paste("No partial_common_nuc_results for cur", cur_index, "ands combi", combi[1], combi[2], "." ))
197
        common_nuc_results[[paste(combi[1], combi[2], sep="_")]] = data.frame()
198
      }
199
      inter_llrs[[paste(combi[1], combi[2], sep="_")]] = extracted_maps[[2]][[paste(combi[1], combi[2], sep="_")]][[2]]
200
    }
201

    
202
    return(list(strain_maps, common_nuc_results, intra_llrs, inter_llrs))
203
  })
204

    
205
  glo_results = do.call("rbind", glo_results)
206

    
207
  llrs_intra = do.call("rbind", glo_results[,3])
208
  llrs_inter = do.call("rbind", glo_results[,4])
209

    
210
  for (strain in strains) {
211
    outpout_joint_filename = paste(config$RESULTS_DIR, "/", strain, "_llrs.tab", sep="")
212
    write.table(unlist(llrs_intra[,strain]), file=outpout_joint_filename, row.names=FALSE, quote=FALSE)
213
  }
214

    
215
  for(combi in combis) {
216
    outpout_joint_filename = paste(config$RESULTS_DIR, "/", combi[1], "_", combi[2], "_llrs.tab", sep="")
217
    write.table(unlist(llrs_inter[,paste(combi[1], combi[2], sep="_")]), file=outpout_joint_filename, row.names=FALSE, quote=FALSE)
218
  }
219

    
220
  glo_strain_maps = do.call("rbind", glo_results[,1])
221
  for (strain in strains) {
222
      nuc_map_filename = paste(config$RESULTS_DIR, "/", strain, "_wp.tab", sep="")
223
      write.table(do.call("rbind", glo_strain_maps[,strain]), file=nuc_map_filename, row.names=FALSE, quote=FALSE)
224
  }
225

    
226
  glo_common_nuc_results = do.call("rbind", glo_results[,2])
227
  for(combi in combis) {
228
    outpout_joint_filename = paste(config$RESULTS_DIR, "/", combi[1], "_", combi[2], "_common_wp.tab", sep="")
229
    write.table(do.call("rbind", glo_common_nuc_results[,paste(combi[1], combi[2], sep="_")])[, c("cur_index", paste("index_nuc", combi[1], sep="_"), paste("index_nuc", combi[2], sep="_"), "llr_score", "common_wp_pval", "diff")], file=outpout_joint_filename, row.names=FALSE, quote=FALSE)
230
  }
231

    
232
  ### build fuzzy maps
233
    wp_maps  = list()
234
    for (strain in strains) {
235
      wp_map_filename = paste(config$RESULTS_DIR, "/", strain, "_wp.tab", sep="")
236
      wp_maps[[strain]] = mread.table(wp_map_filename, header=TRUE)
237
    }
238
    glo_fuzzy = list()
239
    for (strain in strains) {
240
      tmp_wp = wp_maps[[strain]]
241
    	glo_fuzzy[[strain]] = apply(t(1:1:nrow(curs)), 2, function(cur_index) {
242
        print(paste("Building fuzzy for", strain, cur_index, "/", nrow(curs)))
243
        tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ]
244
        tmp_fuzzy = get_intra_strain_fuzzy(tmp_wp, as.list(curs[cur_index,]), strain, config=config)
245
        return(tmp_fuzzy)
246
      })
247
      fuzzy_map_filename = paste(config$RESULTS_DIR, "/", strain, "_fuzzy.tab", sep="")
248
      write.table(do.call("rbind", glo_fuzzy[[strain]]), fuzzy_map_filename, row.names=FALSE, quote=FALSE)
249
    }
250

    
251
  ### build unr maps
252
  wp_maps  = list()
253
  for (strain in strains) {
254
    wp_map_filename = paste(config$RESULTS_DIR, "/", strain, "_wp.tab", sep="")
255
    wp_maps[[strain]] = mread.table(wp_map_filename, header=TRUE)
256
  }
257
  fuzzy_maps = list()
258
  for (strain in strains) {
259
    fuzzy_map_filename = paste(config$RESULTS_DIR, "/", strain, "_fuzzy.tab", sep="")
260
    fuzzy_maps[[strain]] = mread.table(fuzzy_map_filename, header=TRUE)
261
  }
262
  common_nuc_results = list()
263
  for(combi in combis) {
264
    outpout_joint_filename = paste(config$RESULTS_DIR, "/", combi[1], "_", combi[2], "_common_wp.tab", sep="")
265
    tmp_common_wp = mread.table(outpout_joint_filename, header=TRUE)
266
    common_nuc_results[[paste(combi[1], combi[2], sep="_")]] = tmp_common_wp
267
  }
268

    
269
  unr = list()
270
  for(combi in combis) {
271
    unr[[paste(combi[1], combi[2], sep="_")]] = do.call("rbind", apply(t(1:1:nrow(curs)), 2, function(cur_index) {
272
      print(paste("Building UNRs", combi[1], combi[2], cur_index, "/", nrow(curs)))
273
      cur = as.list(curs[cur_index,])
274
      get_unrs(combi, cur, cur_index, wp_maps, fuzzy_maps, common_nuc_results, config=config)
275
    }))
276

    
277
    # unr for strain 1
278
  	unr_map_filename_1 = paste(config$RESULTS_DIR, "/", combi[1], "_in_", combi[1], "_", combi[2], "_unr.tab", sep="")
279
  	write.table(unr[[paste(combi[1], combi[2], sep="_")]][, c("chr", "lower_bound", "upper_bound", "cur_index", "index_nuc")], file=unr_map_filename_1, row.names=FALSE, quote=FALSE)
280

    
281
    # unr for strain 2
282
    print(paste(combi[1], combi[2], "translation..."))
283
    to_tr_unr = unr[[paste(combi[1], combi[2], sep="_")]]
284
    tr_unr = apply(t(1:length(to_tr_unr[,1])), 2, function(i){
285
      if (i %% 100 == 1) {print(paste(combi[1], combi[2], "translation... ", i , "/", length(to_tr_unr[,1])))}
286
      tmp_unr = to_tr_unr[i,]
287
      tmp_reg = list(name="foo", begin=tmp_unr$lower_bound, end=tmp_unr$upper_bound, chr=tmp_unr$chr, strain_ref = combi[1])
288
      big_cur = as.list(curs[tmp_unr$cur_index,])
289
      trs_tmp_reg = translate_cur(tmp_reg, combi[2], config=config, big_cur=big_cur)
290
      if (!is.null(trs_tmp_reg)) {
291
        to_tr_unr[i,]$chr = trs_tmp_reg$chr
292
        to_tr_unr[i,]$lower_bound = min(trs_tmp_reg$begin, trs_tmp_reg$end)
293
        to_tr_unr[i,]$upper_bound = max(trs_tmp_reg$begin, trs_tmp_reg$end)
294
        return(to_tr_unr[i,])
295
        } else {
296
          return(NULL)
297
        }
298
      })
299
    tr_unr = do.call("rbind", tr_unr)
300
    # write it.
301
  	unr_map_filename_2 = paste(config$RESULTS_DIR, "/", combi[2], "_in_", combi[1], "_", combi[2], "_unr.tab", sep="")
302
  	write.table(tr_unr[, c("chr", "lower_bound", "upper_bound", "cur_index", "index_nuc")], file=unr_map_filename_2, row.names=FALSE, quote=FALSE)
303

    
304
    # joint the two tables
305
    common_unr = cbind(tr_unr$cur_index, tr_unr$index_nuc, tr_unr$index_nuc)
306
    common_unr = data.frame(common_unr, stringsAsFactors=FALSE)
307
    names(common_unr) = c("cur_index", paste("index_nuc", combi[1], sep="_"), paste("index_nuc", combi[2], sep="_"))
308
    outpout_joint_unr_filename = paste(config$RESULTS_DIR, "/", combi[1], "_", combi[2], "_common_unr.tab", sep="")
309
    write.table(common_unr, file=outpout_joint_unr_filename, row.names=FALSE, quote=FALSE)
310
  }
311

    
312
}
313

    
314

    
315

    
316

    
317

    
318

    
319
FDR = structure(function#  False Discovery Rate
320
### From a vector x of independent p-values, extract the cutoff corresponding to the specified FDR. See Benjamini & Hochberg 1995 paper
321
##author<< Gael Yvert,
322
(
323
x, ##<< A vector x of independent p-values.
324
FDR ##<< The specified FDR.
325
) {
326
  x <- sort(na.omit(x))
327
  N = length(x)
328
  i = 1;
329
  while(N*x[i]/i < FDR & i <= N) i = i + 1; # we search for the highest i where Nrandom / Nobserved < FDR
330
  if (i == 1)
331
    return (NA)
332
  else
333
    return( x[i-1] )
334
### Return the the corresponding cutoff.
335
}, ex=function(){
336
  print("example")
337
})
338

    
339

    
340

    
341
llr_score_nvecs = structure(function # Likelihood ratio
342
### Compute the log likelihood ratio of two or more set of value.
343
(
344
  xs ##<< list of vectors.
345
) {
346
  l = length(xs)
347
  if (l < 1) {
348
    return(NA)
349
  }
350
  if (l == 1) {
351
    return(1)
352
  }
353
  sumllX = 0
354
  for (i in 1:l) {
355
    x = xs[[i]]
356
  	if (length(x) <= 1) {
357
  		return(NA)
358
  	}
359
    meanX = mean(x)
360
    sdX = sd(x)
361
    llX = sum(log(dnorm(x,mean=meanX,sd=sdX)))
362
    sumllX = sumllX + llX
363
  }
364
  meanXglo = mean(unlist(xs))
365
  sdXglo = sd(unlist(xs))
366
  llXYZ = sum(log(dnorm(unlist(xs),mean=meanXglo,sd=sdXglo)))
367
  ratio = sumllX - llXYZ
368
  return(ratio)
369
### Returns the log likelihood ratio.
370
}, ex=function(){
371
  # LLR score for 2 set of values
372
  mean1=5; sd1=2; card2 = 250
373
  mean2=6; sd2=3; card1 = 200
374
  x1 = rnorm(card1, mean1, sd1)
375
  x2 = rnorm(card2, mean2, sd2)
376
  min = floor(min(c(x1,x2)))
377
  max = ceiling(max(c(x1,x2)))
378
  hist(c(x1,x2), xlim=c(min, max), breaks=min:max)
379
  lines(min:max,dnorm(min:max,mean1,sd1)*card1,col=2)
380
  lines(min:max,dnorm(min:max,mean2,sd2)*card2,col=3)
381
  lines(min:max,dnorm(min:max,mean(c(x1,x2)),sd(c(x1,x2)))*card2,col=4)
382
  llr_score_nvecs(list(x1,x2))
383
 })
384

    
385
dfadd = structure(function# Adding list to a dataframe.
386
### Add a list \emph{l} to a dataframe \emph{df}. Create it if \emph{df} is \emph{NULL}. Return the dataframe \emph{df}.
387
	(df, ##<<  A dataframe
388
		l ##<<  A list
389
	) {
390
  if (is.null(df)) {
391
    df = data.frame(l,stringsAsFactors=FALSE)
392
  } else {
393
    df = rbind(df, data.frame(l,stringsAsFactors=FALSE))
394
  }
395
  return(df)
396
### Return the dataframe \emph{df}.
397
}, ex=function(){
398
		## Here dataframe is NULL
399
		print(df)
400
		df = NULL
401

    
402
		# Initialize df
403
		df = dfadd(df, list(key1 = "value1", key2 = "value2"))
404
		print(df)
405

    
406
		# Adding elements to df
407
		df = dfadd(df, list(key1 = "value1'", key2 = "value2'"))
408
		print(df)
409
})
410

    
411

    
412
sign_from_strand = function(
413
### Get the sign of strand
414
strands) {
415
	apply(t(strands), 2, function(strand) {	if (strand == "F") return(1) else return(-1)})
416
### If strand in forward then returns 1 else returns -1
417
}
418

    
419
flat_reads = function(
420
### Extract reads coordinates from TempleteFilter input sequence
421
reads, ##<< TemplateFilter input reads
422
nuc_width ##<< Width used to shift F and R reads.
423
) {
424
	F_flatted_reads = unlist(apply(t(reads[reads$V3=="F",]),2,function(r){rep(as.integer(r[2]), r[4])}))
425
	R_flatted_reads = unlist(apply(t(reads[reads$V3=="R",]),2,function(r){rep(as.integer(r[2]), r[4])}))
426
	flatted_reads = c(F_flatted_reads + rep(nuc_width/2, length(F_flatted_reads)), R_flatted_reads - rep(nuc_width/2, length(R_flatted_reads))  )
427
	return(list(F_flatted_reads, R_flatted_reads, flatted_reads))
428
### Returns a list of F reads, R reads and joint/shifted F and R reads.
429
}
430

    
431

    
432

    
433
filter_tf_outputs = function(# Filter TemplateFilter outputs
434
### This function filters TemplateFilter outputs according, not only genome area observerved properties, but also correlation and overlapping threshold.
435
tf_outputs, ##<< TemplateFilter outputs.
436
chr, ##<< Chromosome observed, here chr is an integer.
437
x_min, ##<< Coordinate of the first bp observed.
438
x_max, ##<< Coordinate of the last bp observed.
439
nuc_width = 160, ##<< Nucleosome width.
440
ol_bp = 59, ##<< Overlap Threshold.
441
corr_thres = 0.5 ##<< Correlation threshold.
442
) {
443
  if (x_min < 0) {
444
    tf_outputs = tf_outputs[tf_outputs$chr == paste("chr", chr, sep="") & tf_outputs$center - tf_outputs$width/2 >= -x_max & tf_outputs$center + tf_outputs$width/2 <=  -x_min,]
445
	} else {
446
    tf_outputs = tf_outputs[tf_outputs$chr == paste("chr", chr, sep="") & tf_outputs$center - tf_outputs$width/2 >= x_min & tf_outputs$center + tf_outputs$width/2 <= x_max,]
447
  }
448
  tf_outputs$lower_bound = tf_outputs$center - tf_outputs$width/2
449
  tf_outputs$upper_bound = tf_outputs$center + tf_outputs$width/2
450
  tf_outputs = tf_outputs[tf_outputs$correlation.score >= corr_thres,]
451
  tf_outputs = tf_outputs[order(tf_outputs$correlation.score, decreasing=TRUE),]
452
  i = 1
453
  while (i <= length(tf_outputs[,1])) {
454
    lb = tf_outputs[i,]$lower_bound
455
    ub = tf_outputs[i,]$upper_bound
456
    tf_outputs = tf_outputs[!(tf_outputs$lower_bound <= (ub-ol_bp) & tf_outputs$upper_bound > ub) & !(tf_outputs$upper_bound >= (lb+ol_bp) & tf_outputs$lower_bound < lb),]
457
    i = i+1
458
  }
459
  return(tf_outputs)
460
### Returns filtered TemplateFilter Outputs
461
}
462

    
463

    
464

    
465
filter_tf_inputs = function(# Filter TemplateFilter inputs
466
### This function filters TemplateFilter inputs according genome area observed properties. It takes into account reads that are at the frontier of this area and the strand of these reads.
467
inputs, ##<< TF inputs to be filtered.
468
chr, ##<< Chromosome observed, here chr is an integer.
469
x_min, ##<< Coordinate of the first bp observed.
470
x_max, ##<< Coordinate of the last bp observed.
471
nuc_width = 160, ##<< Nucleosome width.
472
only_f = FALSE, ##<< Filter only F reads.
473
only_r = FALSE, ##<< Filter only R reads.
474
filter_for_coverage = FALSE ##<< Does it filter for plot coverage?
475
) {
476
	if (only_f) {
477
		inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "F" & inputs[,2] <= x_max + nuc_width,]
478
	} else if (only_r) {
479
		inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "R" & inputs[,2] <= x_max + nuc_width,]			
480
	} else {
481
		inputs = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,2] <= x_max + nuc_width,]
482
	}
483
  if (!filter_for_coverage) {
484
    corrected_inputs_coords = inputs[,2] + nuc_width/2 * sign_from_strand(inputs[,3])
485
    inputs = inputs[inputs[,1]==chr & corrected_inputs_coords >= x_min & corrected_inputs_coords <= x_max,]    
486
  }
487
	return(inputs)
488
### Returns filtred inputs.
489
}
490

    
491

    
492

    
493
# filter_tf_inputs = function(# Filter TemplateFilter inputs
494
# ### This function filters TemplateFilter inputs according genome area observed properties. It takes into account reads that are at the frontier of this area and the strand of these reads.
495
# inputs, ##<< TF inputs to be filtered.
496
# chr, ##<< Chromosome observed, here chr is an integer.
497
# x_min, ##<< Coordinate of the first bp observed.
498
# x_max, ##<< Coordinate of the last bp observed.
499
# nuc_width = 160, ##<< Nucleosome width.
500
# only_f = FALSE, ##<< Filter only F reads.
501
# only_r = FALSE, ##<< Filter only R reads.
502
# filter_for_coverage = FALSE, ##<< Does it filter for plot coverage?
503
# USE_DPLYR = FALSE ##<< Use dplyr lib to filter reads.
504
# ) {
505
#   n = names(inputs)
506
#
507
#   if (!USE_DPLYR) {
508
#     if (only_f) {
509
#       inputs_out = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "F" & inputs[,2] <= x_max + nuc_width,]
510
#     } else if (only_r) {
511
#       inputs_out = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,3] == "R" & inputs[,2] <= x_max + nuc_width,]
512
#     } else {
513
#       inputs_out = inputs[inputs[,1]==chr & inputs[,2] >= x_min - nuc_width & inputs[,2] <= x_max + nuc_width,]
514
#     }
515
#   } else {
516
#     names(inputs) = c("chr", "pos", "str", "lev")
517
#     if (only_f) {
518
#       inputs_out = filter(inputs, chr == chr,  pos >= x_min - nuc_width, str == "F", pos <= x_max + nuc_width)
519
#     } else if (only_r) {
520
#       inputs_out = filter(inputs, chr == chr, pos >= x_min - nuc_width, str == "R" & pos <= x_max + nuc_width)
521
#     } else {
522
#       inputs_out = filter(inputs, chr == chr, pos >= x_min - nuc_width, pos <= x_max + nuc_width)
523
#     }
524
#       # if (!filter_for_coverage) {
525
#       #   inputs$corrected_inputs_coords = inputs[,2] + nuc_width/2 * sign_from_strand(inputs[,3])
526
#       #   inputs = filter(inputs, chr == chr, corrected_inputs_coords >= x_min, corrected_inputs_coords <= x_max)
527
#       #   inputs$corrected_inputs_coords = NULL
528
#       # }
529
#   }
530
#
531
#   if (!filter_for_coverage) {
532
#     corrected_inputs_coords = inputs_out[,2] + nuc_width/2 * sign_from_strand(inputs_out[,3])
533
#     inputs_out = inputs_out[inputs_out[,1]==chr & corrected_inputs_coords >= x_min & corrected_inputs_coords <= x_max,]
534
#   }
535
#
536
#   names(inputs_out) = n
537
#   return(inputs_out)
538
# ### Returns filtred inputs.
539
# }
540

    
541

    
542

    
543
get_comp_strand = function(
544
### Compute the complementatry strand.
545
strand ##<< The original strand.
546
) {
547
	apply(t(strand),2, function(n){
548
	  if (n=="a") {return("t")}
549
		if (n=="t") {return("a")}
550
		if (n=="c") {return("g")}
551
		if (n=="g") {return("c")}
552
	})
553
### Returns the complementatry strand.
554
}
555

    
556

    
557
aggregate_intra_strain_nucs = structure(function(# Aggregate replicated sample's nucleosomes.
558
### This function aggregates nucleosomes from replicated samples. It uses TemplateFilter ouput of each sample as replicate. Each sample owns a set of nucleosomes computed using TemplateFilter and ordered by the position of their center (dyad). A chain of nucleosomes is builts across all replicates. Adjacent nucleosomes of the chain are compared two by two. Comparison is based on a log likelihood ratio (LLR1). depending on the LLR1 value nucleosomes are merged (low LLR) or separated (high LLR). Finally the function returns a list of clusters and all computed llr_scores. Each cluster ows an attribute wp for "well positioned". This attribute is set to TRUE if the cluster is composed of exactly one nucleosome of each sample.
559
samples, ##<< A list of samples. Each sample is a list like \emph{sample = list(id=..., marker=..., strain=..., roi=..., inputs=..., outputs=...)} with \emph{roi = list(name=..., begin=...,  end=..., chr=..., genome=...)}.
560
llr_thres=20, ##<< Log likelihood ratio threshold to decide between merging and separating
561
coord_max=20000000 ##<< A too big value to be a coord for a nucleosome lower bound.
562
){
563
	end_of_tracks = function(tracks) {
564
		if (length(tracks) == 0) {
565
			return(TRUE)
566
		}
567
	  for (lower_bound in tracks) {
568
			if(!is.na(lower_bound)) {
569
	      if (lower_bound < coord_max) {
570
	        return(FALSE)
571
	      }
572
	  	}
573
	  }
574
	  return(TRUE)
575
	}
576
	store_cluster = function(clusters, new_cluster, nb_nucs_in_cluster, nuc_from_track, nb_tracks, min_nuc_center, max_nuc_center) {
577
		if ( nb_nucs_in_cluster==nb_tracks & sum(nuc_from_track)==nb_tracks) {
578
			new_cluster$wp = TRUE
579
			center = (new_cluster$lower_bound + new_cluster$upper_bound) / 2
580
			if (is.null(min_nuc_center) | ((min_nuc_center <= center) & (center < max_nuc_center))) {
581
		  	clusters[[length(clusters) + 1]] = new_cluster
582
				# print(new_cluster)
583
		  }
584
		} else {
585
			new_cluster$wp = FALSE
586
			center = (new_cluster$lower_bound + new_cluster$upper_bound) / 2
587
			if (is.null(min_nuc_center) | ((min_nuc_center <= center) & (center < max_nuc_center))) {
588
			  clusters[[length(clusters) + 1]] = new_cluster
589
			}
590
		}
591
		return(clusters)
592
	}
593
	strain = samples[[1]]$strain
594
	llr_scores = c()
595
  min_nuc_center = min(samples[[1]]$roi$begin, samples[[1]]$roi$end)
596
	max_nuc_center = max(samples[[1]]$roi$begin, samples[[1]]$roi$end)
597
  # compute clusters
598
  clusters = list()
599
  cluster_contents = list()
600
  # Init reader
601
  indexes = c()
602
  track_readers = c()
603
  current_nuc = NULL
604
	llr_score = llr_thres + 1
605
  # Read nucs from TF outputs
606
  tf_outs = list()
607
	i = 1
608
  for (sample in samples) {
609
		tf_outs[[i]] = sample$outputs
610
		tf_outs[[i]] = tf_outs[[i]][order(tf_outs[[i]]$center),]
611
    indexes[i] = 1
612
		if (is.na(tf_outs[[i]][indexes[i],]$center)) {
613
      track_readers[i] = coord_max
614
	  } else {
615
      track_readers[i] = tf_outs[[i]][indexes[i],]$center
616
		}
617
		i = i+1
618
  }
619
  nb_tracks = length(tf_outs)
620
	# print(track_readers)
621
  new_cluster = NULL
622
  nb_nucs_in_cluster = 0
623
  nuc_from_track = c()
624
  for (i in 1:nb_tracks){
625
    nuc_from_track[i] = FALSE
626
  }
627
  # Start clustering
628
  while (!end_of_tracks(track_readers)) {
629
    new_center = min(track_readers)
630
		current_track = which(track_readers == new_center)[1]
631
    new_nuc = as.list(tf_outs[[current_track]][indexes[current_track],])
632
		new_nuc$chr = substr(new_nuc$chr,4,1000000L)
633
		new_nuc$inputs = samples[[current_track]]$inputs
634
		new_nuc$chr = samples[[current_track]]$roi$chr
635
		new_nuc$track = current_track
636

    
637
		new_nuc$inputs = filter_tf_inputs(samples[[current_track]]$inputs, new_nuc$chr, new_nuc$lower_bound, new_nuc$upper_bound, new_nuc$width)
638
		flatted_reads = flat_reads(new_nuc$inputs, new_nuc$width)
639
		new_nuc$original_reads = flatted_reads[[3]]
640

    
641
    new_upper_bound = new_nuc$upper_bound
642

    
643
    if (!is.null(current_nuc)) {
644
			llr_score = llr_score_nvecs(list(current_nuc$original_reads,new_nuc$original_reads))
645
			llr_scores = c(llr_scores,llr_score)
646
		}
647
		# print(paste(llr_score, length(current_nuc$original_reads), length(new_nuc$original_reads), sep=" "))
648
		if (is.na(llr_score)) {
649
			llr_score = llr_thres + 1
650
		}
651
		# Store llr_score
652
		new_nuc$llr_score = llr_score
653
	  if (llr_score < llr_thres) {
654
      # aggregate to current cluster
655
      #   update bound
656
      if (new_nuc$upper_bound > new_cluster$upper_bound) {
657
        new_cluster$upper_bound = new_nuc$upper_bound
658
      }
659
      if (new_nuc$lower_bound < new_cluster$lower_bound) {
660
        new_cluster$lower_bound = new_nuc$lower_bound
661
      }
662
      #   add nucleosome to current cluster
663
      nuc_from_track[current_track] = TRUE
664
      nb_nucs_in_cluster = nb_nucs_in_cluster + 1
665
			new_cluster$nucs[[length(new_cluster$nucs)+1]] = new_nuc
666
    } else {
667
			if (!is.null(new_cluster)) {
668
        # store old cluster
669
	      clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster, nuc_from_track, nb_tracks, min_nuc_center, max_nuc_center)
670
			}
671
      # Reinit current cluster composition stuff
672
      nb_nucs_in_cluster = 0
673
      nuc_from_track = c()
674
      for (i in 1:nb_tracks){
675
        nuc_from_track[i] = FALSE
676
      }
677
      # create new cluster
678
      new_cluster = list(lower_bound=new_nuc$low, upper_bound=new_nuc$up, chr=new_nuc$chr, strain_ref=strain , nucs=list())
679
      # update upper bound
680
      current_upper_bound = new_upper_bound
681
      # add nucleosome to current cluster
682
      nb_nucs_in_cluster = nb_nucs_in_cluster + 1
683
      nuc_from_track[current_track] = TRUE
684
			new_cluster$nucs[[length(new_cluster$nucs)+1]] = new_nuc
685

    
686
		}
687

    
688
		current_nuc = new_nuc
689

    
690
    # update indexes
691
    if (indexes[current_track] < length(tf_outs[[current_track]]$center)) {
692
      indexes[current_track] = indexes[current_track] + 1
693
      # update track
694
      track_readers[current_track] = tf_outs[[current_track]][indexes[current_track],]$center
695
    } else {
696
      # update track
697
      track_readers[current_track] = coord_max
698
    }
699
  }
700
  # store last cluster
701
  if (!is.null(new_cluster)) {
702
    # store old cluster
703
    clusters = store_cluster(clusters, new_cluster, nb_nucs_in_cluster, nuc_from_track, nb_tracks, min_nuc_center, max_nuc_center)
704
  }
705
	return(list(clusters, llr_scores))
706
### Returns a list of clusterized nucleosomes, and all computed llr scores.
707
}, ex=function(){
708
	# Dealing with a region of interest
709
	roi =list(name="example", begin=1000,  end=1300, chr="1", genome=rep("A",301))
710
	samples = list()
711
	for (i in 1:3) {
712
		# Create TF output
713
		tf_nuc = list("chr"=paste("chr", roi$chr, sep=""), "center"=(roi$end + roi$begin)/2, "width"= 150, "correlation.score"= 0.9)
714
		outputs = dfadd(NULL,tf_nuc)
715
		outputs = filter_tf_outputs(outputs, roi$chr, roi$begin, roi$end)
716
		# Generate corresponding reads
717
		nb_reads = round(runif(1,170,230))
718
		reads = round(rnorm(nb_reads, tf_nuc$center,20))
719
		u_reads = sort(unique(reads))
720
		strands = sample(c(rep("R",ceiling(length(u_reads)/2)),rep("F",floor(length(u_reads)/2))))
721
		counts = apply(t(u_reads), 2, function(r) { sum(reads == r)})
722
		shifts = apply(t(strands), 2, function(s) { if (s == "F") return(-tf_nuc$width/2) else return(tf_nuc$width/2)})
723
		u_reads = u_reads + shifts
724
		inputs = data.frame(list("V1" = rep(roi$chr, length(u_reads)),
725
		                         "V2" = u_reads,
726
														 "V3" = strands,
727
														 "V4" = counts), stringsAsFactors=FALSE)
728
		samples[[length(samples) + 1]] = list(id=1, marker="Mnase_Seq", strain="strain_ex", total_reads = 10000000, roi=roi, inputs=inputs, outputs=outputs)
729
	}
730
	print(aggregate_intra_strain_nucs(samples))
731
})
732

    
733
flat_aggregated_intra_strain_nucs = function(# to flat aggregate_intra_strain_nucs function output
734
### This function builds a dataframe of all clusters obtain from aggregate_intra_strain_nucs function.
735
partial_strain_maps, ##<< the output of aggregate_intra_strain_nucs function
736
cur_index, ##<< the index of the roi involved
737
nb_tracks=3 ##<< the number of replicates
738
) {
739
	if  (length(partial_strain_maps) == 0 ){
740
		print(paste("Empty partial_strain_maps for roi", cur_index, "ands current strain." ))
741
    tmp_strain_maps = list()
742
	} else {
743
		tmp_strain_map = apply(t(1:length(partial_strain_maps)), 2, function(i){
744
			tmp_nuc = partial_strain_maps[[i]]
745
			tmp_nuc_as_list = list()
746
			tmp_nuc_as_list[["chr"]] = tmp_nuc[["chr"]]
747
			tmp_nuc_as_list[["lower_bound"]] = ceiling(tmp_nuc[["lower_bound"]])
748
			tmp_nuc_as_list[["upper_bound"]] = floor(tmp_nuc[["upper_bound"]])
749
			tmp_nuc_as_list[["cur_index"]] = cur_index
750
			tmp_nuc_as_list[["index_nuc"]] = i
751
			tmp_nuc_as_list[["wp"]] = as.integer(tmp_nuc$wp)
752
			all_original_reads = c()
753
			for (j in 1:length(tmp_nuc$nucs)) {
754
				all_original_reads = c(all_original_reads, tmp_nuc$nucs[[j]]$original_reads)
755
			}
756
			tmp_nuc_as_list[["nb_reads"]] = length(all_original_reads)
757
			tmp_nuc_as_list[["nb_nucs"]] = length(tmp_nuc$nucs)
758
			if (tmp_nuc$wp) {
759
        for (i in 1:(nb_tracks-1)) {
760
  				tmp_nuc_as_list[[paste("llr", i, sep="_")]] = signif(tmp_nuc$nucs[[i + 1]]$llr_score,5)          
761
        }
762
			} else {
763
        for (i in 1:(nb_tracks-1)) {
764
  				tmp_nuc_as_list[[paste("llr", i, sep="_")]] = NA
765
        }
766
			}
767
      return(tmp_nuc_as_list)
768
    })
769
    tmp_strain_maps = do.call("rbind", tmp_strain_map)
770
	}
771
  return(data.frame(lapply(data.frame(tmp_strain_maps, stringsAsFactors=FALSE), unlist), stringsAsFactors=FALSE))
772
### Returns a dataframe of all clusters obtain from aggregate_intra_strain_nucs function.
773
}
774

    
775
align_inter_strain_nucs = structure(function(# Aligns nucleosomes between 2 strains.
776
### This function aligns nucleosomes between two strains for a given genome region.
777
replicates, ##<< Set of replicates, ideally 3 per strain.
778
wp_nucs_strain_ref1=NULL, ##<< List of aggregates nucleosome for strain 1. If it's NULL this list will be computed.
779
wp_nucs_strain_ref2=NULL, ##<< List of aggregates nucleosome for strain 2. If it's NULL this list will be computed.
780
corr_thres=0.5, ##<< Correlation threshold.
781
llr_thres=100, ##<< Log likelihood ratio threshold to decide between merging and separating
782
config=NULL, ##<< GLOBAL config variable
783
... ##<< A list of parameters that will be passed to \emph{aggregate_intra_strain_nucs} if needed.
784
) {
785
	if (length(replicates) < 2) {
786
		stop("ERROR, align_inter_strain_nucs needs 2 replicate sets.")
787
	} else if (length(replicates) > 2) {
788
		print("WARNING, align_inter_strain_nucs will use 2 first sets of replicates as inputs.")
789
	}
790
	common_nuc = NULL
791
	llr_scores = c()
792
	chr = replicates[[1]][[1]]$roi$chr
793
  min_nuc_center = min(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end)
794
	max_nuc_center = max(replicates[[1]][[1]]$roi$begin, replicates[[1]][[1]]$roi$end)
795
	strain_ref1 = replicates[[1]][[1]]$strain
796
	strain_ref2 = replicates[[2]][[1]]$strain
797
	big_cur = replicates[[1]][[1]]$roi
798
  orig_big_cur = replicates[[1]][[1]]$orig_roi
799
	if(big_cur$end - big_cur$begin < 0) {
800
		tmp_begin = big_cur$begin
801
		big_cur$begin =  big_cur$end
802
		big_cur$end =  tmp_begin
803
	}
804
	# GO!
805
	if (is.null(wp_nucs_strain_ref1)) {
806
		wp_nucs_strain_ref1 = aggregate_intra_strain_nucs(replicates[[1]], ...)[[1]]
807
	}
808
	if (is.null(wp_nucs_strain_ref2)) {
809
	  wp_nucs_strain_ref2 = aggregate_intra_strain_nucs(replicates[[2]], ...)[[1]]
810
  }
811
	lws = c()
812
	ups = c()
813
	for (na in wp_nucs_strain_ref2) {
814
		lws = c(lws, na$lower_bound)
815
		ups = c(ups, na$upper_bound)
816
	}
817

    
818
	print(paste("Exploring chr" , chr , ", " , length(wp_nucs_strain_ref1) , ", [" , min_nuc_center , ", " , max_nuc_center , "] nucs...", sep=""))
819
	roi_strain_ref1 = NULL
820
	roi_strain_ref2 = NULL
821
	if (length(wp_nucs_strain_ref1) > 0) {
822
		for(index_nuc_strain_ref1 in 1:length(wp_nucs_strain_ref1)){
823
			# print(paste("" , index_nuc_strain_ref1 , "/" , length(wp_nucs_strain_ref1), sep=""))
824
			nuc_strain_ref1 = wp_nucs_strain_ref1[[index_nuc_strain_ref1]]
825
			# Filtering on Well Positionned
826
			if (nuc_strain_ref1$wp) {
827
				roi_strain_ref1 = list(name=paste("strain_chr_id_" , strain_ref1 , "_" , chr , "_" , "i" , "_", sep=""), begin=nuc_strain_ref1$lower_bound, end=nuc_strain_ref1$upper_bound, chr=chr, strain_ref = strain_ref1)
828
				roi_strain_ref2 = translate_cur(roi_strain_ref1, strain_ref2, big_cur=orig_big_cur, config=config)
829
        if (!is.null(roi_strain_ref2)){
830
					# LOADING INTRA_STRAIN_NUCS_FILENAME_STRAIN_REF2 FILE(S) TO COMPUTE MATCHING_NAS (FILTER)
831
					lower_bound_roi_strain_ref2 = min(roi_strain_ref2$end,roi_strain_ref2$begin)
832
					upper_bound_roi_strain_ref2 = max(roi_strain_ref2$end,roi_strain_ref2$begin)
833
					matching_nas = which( lower_bound_roi_strain_ref2 <= ups & lws <= upper_bound_roi_strain_ref2)
834
					for (index_nuc_strain_ref2 in matching_nas) {
835
						nuc_strain_ref2 = wp_nucs_strain_ref2[[index_nuc_strain_ref2]]
836
						# Filtering on Well Positionned
837
    				nuc_strain_ref2_to_roi = list(begin=nuc_strain_ref2$lower_bound, end=nuc_strain_ref2$upper_bound, chr=nuc_strain_ref2$chr, strain_ref = strain_ref2)
838
						if (!is.null(translate_cur(nuc_strain_ref2_to_roi, strain_ref1, big_cur=orig_big_cur, config=config)) &
839
                nuc_strain_ref2$wp) {
840
							# Filtering on correlation Score and collecting reads
841
							SKIP = FALSE
842
							# TODO: This for loop could be done before working on strain_ref2. Isn't it?
843
							reads_strain_ref1 = c()
844
							for (nuc in nuc_strain_ref1$nucs){
845
								reads_strain_ref1 = c(reads_strain_ref1, nuc$original_reads)
846
								if (nuc$corr < corr_thres) {
847
									SKIP = TRUE
848
								}
849
							}
850
							reads_strain_ref2 = c()
851
							for (nuc in nuc_strain_ref2$nucs){
852
								reads_strain_ref2 = c(reads_strain_ref2, nuc$original_reads)
853
								if (nuc$corr < corr_thres) {
854
									SKIP = TRUE
855
								}
856
							}
857
							# Filtering on correlation Score
858
							if (!SKIP) {
859
								# tranlation of reads into strain 2 coords
860
								diff = ((roi_strain_ref1$begin + roi_strain_ref1$end) - (roi_strain_ref2$begin + roi_strain_ref2$end)) / 2
861
								reads_strain_ref1 = reads_strain_ref1 - rep(diff, length(reads_strain_ref1))
862
								llr_score = llr_score_nvecs(list(reads_strain_ref1, reads_strain_ref2))
863
								llr_scores = c(llr_scores, llr_score)
864
								# Filtering on LLR Score
865
                if (llr_score < llr_thres) {
866
									tmp_nuc = list()
867
									# strain_ref1
868
									tmp_nuc[[paste("chr_", strain_ref1, sep="")]] = chr
869
									tmp_nuc[[paste("lower_bound_", strain_ref1, sep="")]] = nuc_strain_ref1$lower_bound
870
									tmp_nuc[[paste("upper_bound_", strain_ref1, sep="")]] = nuc_strain_ref1$upper_bound
871
									tmp_nuc[[paste("mean_", strain_ref1, sep="")]] = signif(mean(reads_strain_ref1),5)
872
									tmp_nuc[[paste("sd_", strain_ref1, sep="")]] = signif(sd(reads_strain_ref1),5)
873
									tmp_nuc[[paste("nb_reads_", strain_ref1, sep="")]] = length(reads_strain_ref1)
874
									tmp_nuc[[paste("index_nuc_", strain_ref1, sep="")]] = index_nuc_strain_ref1
875
									# strain_ref2
876
									tmp_nuc[[paste("chr_", strain_ref2, sep="")]] = roi_strain_ref2$chr
877
									tmp_nuc[[paste("lower_bound_", strain_ref2, sep="")]] = nuc_strain_ref2$lower_bound
878
									tmp_nuc[[paste("upper_bound_", strain_ref2, sep="")]] = nuc_strain_ref2$upper_bound
879
									tmp_nuc[[paste("means_", strain_ref2, sep="")]] = signif(mean(reads_strain_ref2),5)
880
									tmp_nuc[[paste("sd_", strain_ref2, sep="")]] = signif(sd(reads_strain_ref2),5)
881
									tmp_nuc[[paste("nb_reads_", strain_ref2, sep="")]] = length(reads_strain_ref2)
882
									tmp_nuc[[paste("index_nuc_", strain_ref2, sep="")]] = index_nuc_strain_ref2
883
									# common
884
									tmp_nuc[["llr_score"]] = signif(llr_score,5)
885
                  tmp_nuc[["common_wp_pval"]] = signif(1-pchisq(2*tmp_nuc[["llr_score"]], df=2),5)
886
									tmp_nuc[["diff"]] = abs(abs((roi_strain_ref2$begin - roi_strain_ref2$end)) / 2 - abs((nuc_strain_ref2$lower_bound - nuc_strain_ref2$upper_bound)) / 2)
887
									common_nuc = dfadd(common_nuc, tmp_nuc)
888
                }
889
							}
890
						}
891
					}
892
				} else {
893
		      print("WARNING! No roi for strain ref 2.")
894
			  }
895
		  }
896
		}
897

    
898
    if(length(unique(common_nuc[,1:3])[,1]) != length((common_nuc[,1:3])[,1])) {
899
      index_redundant = which(apply(common_nuc[,1:3][-length(common_nuc[,1]),] ==  common_nuc[,1:3][-1,] ,1,sum) == 3)
900
      to_remove_list = c()
901
      for (i in 1:length(index_redundant)) {
902
        if (common_nuc[index_redundant[i],15] < common_nuc[index_redundant[i]+1,15]) {
903
          to_remove = index_redundant[i]
904
        }   else {
905
          to_remove = index_redundant[i] + 1
906
        }
907
        to_remove_list = c(to_remove_list, to_remove)
908
      }
909
      common_nuc = common_nuc[-to_remove_list,]
910
    }
911

    
912
    if(length(unique(common_nuc[,8:10])[,1]) != length((common_nuc[,8:10])[,1])) {
913
      index_redundant = which(apply(common_nuc[,8:10][-length(common_nuc[,1]),] == common_nuc[,8:10][-1,] ,1,sum) == 3)
914
      to_remove_list = c()
915
      for (i in 1:length(index_redundant)) {
916
        if (common_nuc[index_redundant[i],15] < common_nuc[index_redundant[i]+1,15]) {
917
          to_remove = index_redundant[i]
918
        }   else {
919
          to_remove = index_redundant[i] + 1
920
        }
921
        to_remove_list = c(to_remove_list, to_remove)
922
      }
923
      common_nuc = common_nuc[-to_remove_list,]
924
    }
925

    
926
		return(list(common_nuc, llr_scores))
927
	} else {
928
		print("WARNING, no nucs for strain_ref1.")
929
		return(NULL)
930
	}
931
### Returns a list of clusterized nucleosomes, and all computed llr scores.
932
}, ex=function(){
933

    
934
    # Define new translate_cur function...
935
    translate_cur = function(roi, strain2, big_cur=NULL, config=NULL) {
936
      return(roi)
937
    }
938
    # Binding it by uncomment follwing lines.
939
    unlockBinding("translate_cur", as.environment("package:nucleominer"))
940
    unlockBinding("translate_cur", getNamespace("nucleominer"))
941
    assign("translate_cur", translate_cur, "package:nucleominer")
942
    assign("translate_cur", translate_cur, getNamespace("nucleominer"))
943
    lockBinding("translate_cur", getNamespace("nucleominer"))
944
    lockBinding("translate_cur", as.environment("package:nucleominer"))
945

    
946
	# Dealing with a region of interest
947
	roi =list(name="example", begin=1000,  end=1300, chr="1", genome=rep("A",301), strain_ref1 = "STRAINREF1")
948
	roi2 = translate_cur(roi, roi$strain_ref1)
949
	replicates = list()
950
	for (j in 1:2) {
951
		samples = list()
952
		for (i in 1:3) {
953
			# Create TF output
954
			tf_nuc = list("chr"=paste("chr", roi$chr, sep=""), "center"=(roi$end + roi$begin)/2, "width"= 150, "correlation.score"= 0.9)
955
			outputs = dfadd(NULL,tf_nuc)
956
			outputs = filter_tf_outputs(outputs, roi$chr, roi$begin, roi$end)
957
			# Generate corresponding reads
958
			nb_reads = round(runif(1,170,230))
959
			reads = round(rnorm(nb_reads, tf_nuc$center,20))
960
			u_reads = sort(unique(reads))
961
			strands = sample(c(rep("R",ceiling(length(u_reads)/2)),rep("F",floor(length(u_reads)/2))))
962
			counts = apply(t(u_reads), 2, function(r) { sum(reads == r)})
963
			shifts = apply(t(strands), 2, function(s) { if (s == "F") return(-tf_nuc$width/2) else return(tf_nuc$width/2)})
964
			u_reads = u_reads + shifts
965
			inputs = data.frame(list("V1" = rep(roi$chr, length(u_reads)),
966
			                         "V2" = u_reads,
967
															 "V3" = strands,
968
															 "V4" = counts), stringsAsFactors=FALSE)
969
			samples[[length(samples) + 1]] = list(id=1, marker="Mnase_Seq", strain=paste("strain_ex",j,sep=""), total_reads = 10000000, roi=roi, inputs=inputs, outputs=outputs)
970
		}
971
		replicates[[length(replicates) + 1]] = samples
972
	}
973
	print(align_inter_strain_nucs(replicates))
974
})
975

    
976

    
977

    
978

    
979

    
980

    
981

    
982

    
983

    
984

    
985

    
986

    
987

    
988

    
989

    
990

    
991

    
992

    
993

    
994

    
995

    
996

    
997

    
998
mread.table = 
999
### memoised version of read.table
1000
memoise(
1001
  function(file, ...){
1002
  print(paste("Memoising ", file, "...", sep=""))
1003
  read.table(file, ...)
1004
})
1005
mread.fasta = memoise(function( 
1006
file, ...){
1007
  print(paste("Memoising ", file, "...", sep=""))
1008
  read.fasta(file, ...)
1009
})
1010

    
1011

    
1012
fetch_mnase_replicates = function(# Prefetch data
1013
### Fetch and filter inputs and outpouts per region of interest. Organize it per replicates.
1014
strain, ##<< The strain we want mnase replicatesList of replicates. Each replicates is a vector of sample ids.
1015
roi, ##<< Region of interest.
1016
all_samples, ##<< Global list of samples.
1017
config=NULL, ##<< GLOBAL config variable
1018
only_fetch=FALSE, ##<< If TRUE, only fetch and not filtering. It is used tio load sample files into memory before forking.
1019
get_genome=FALSE, ##<< If TRUE, load corresponding genome sequence.
1020
get_ouputs=TRUE##<< If TRUE, get also ouput corresponding TF output files.
1021
) {
1022
	samples=list()
1023
  samples_ids = unique(all_samples[all_samples$marker == "Mnase_Seq" & all_samples$strain == strain,]$id)
1024
	for (i in samples_ids) {
1025
		sample = as.list(all_samples[all_samples$id==i,])
1026
    sample$orig_roi = roi
1027
    sample$roi = translate_cur(roi, sample$strain, config = config)
1028
		if (get_genome) {
1029
			# Get Genome
1030
      sample$roi$genome = mread.fasta(config$FASTA_REFERENCE_GENOME_FILES[[sample$strain]])[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end]
1031
		}
1032
		# Get inputs
1033
    if ("tf_input" %in% names(all_samples)) {
1034
      sample_inputs_filename = all_samples$tf_input[all_samples$id==i]
1035
    } else {
1036
      sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep="")
1037
    }
1038
		sample$inputs = mread.table(sample_inputs_filename, stringsAsFactors=FALSE)
1039
		sample$total_reads = sum(sample$inputs[,4])
1040
		if (!only_fetch) {
1041
		  sample$inputs = filter_tf_inputs(sample$inputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
1042
	  }
1043
	  # Get TF outputs for Mnase_Seq samples
1044
		if (sample$marker == "Mnase_Seq" & get_ouputs) {
1045
      if ("tf_output" %in% names(all_samples)) {
1046
        sample_outputs_filename = all_samples$tf_output[all_samples$id==i]
1047
      } else {
1048
        sample_outputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep="")                
1049
      }
1050
      sample$outputs = mread.table(sample_outputs_filename, header=TRUE, sep="\t")
1051
			if (!only_fetch) {
1052
	  		sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr,  min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
1053
  		}
1054
		}
1055
		samples[[length(samples) + 1]] = sample
1056
	}
1057
  return(samples)
1058
}
1059

    
1060
substract_region = function(# Substract to a list of regions an other list of regions that intersect it.
1061
### This fucntion embed a recursive part. It occurs when a substracted region split an original region on two.
1062
region1, ##<< Original regions.
1063
region2 ##<< Regions to substract.
1064
) {
1065
  rec_substract_region = function(region1, region2) {
1066
  non_inter_fuzzy = apply(t(1:length(region1[,1])), 2, function(i) {
1067
    cur_fuzzy = region1[i,]
1068
    inter_wp = region2[region2$lower_bound <= cur_fuzzy$upper_bound & region2$upper_bound >= cur_fuzzy$lower_bound,]
1069
    if (length(inter_wp[,1]) > 0) {
1070
      ret = c()
1071
      for (j in 1:length(inter_wp[,1])) {
1072
        cur_wp = inter_wp[j,]
1073
        if (cur_wp$lower_bound <= cur_fuzzy$lower_bound & cur_fuzzy$upper_bound <= cur_wp$upper_bound) {
1074
          # remove cur_fuzzy
1075
          ret = c()
1076
          break
1077
        } else if (cur_wp$lower_bound <= cur_fuzzy$lower_bound & cur_wp$upper_bound < cur_fuzzy$upper_bound) {
1078
          # crop fuzzy
1079
          cur_fuzzy$lower_bound = cur_wp$upper_bound + 1
1080
          ret = cur_fuzzy
1081
        } else if (cur_fuzzy$lower_bound < cur_wp$lower_bound & cur_fuzzy$upper_bound <= cur_wp$upper_bound) {
1082
          # crop fuzzy
1083
          cur_fuzzy$upper_bound = cur_wp$lower_bound - 1
1084
          ret = cur_fuzzy
1085
        } else if (cur_wp$lower_bound > cur_fuzzy$lower_bound & cur_wp$upper_bound < cur_fuzzy$upper_bound) {
1086
          # split fuzzy
1087
          tmp_ret_fuzzy_1 = cur_fuzzy
1088
          tmp_ret_fuzzy_1$upper_bound = cur_wp$lower_bound - 1
1089
          tmp_ret_fuzzy_2 = cur_fuzzy
1090
          tmp_ret_fuzzy_2$lower_bound = cur_wp$upper_bound + 1
1091
          ret = rec_substract_region(rbind(tmp_ret_fuzzy_1, tmp_ret_fuzzy_2), inter_wp)
1092
          # print(ret)
1093
          # ret = cur_fuzzy
1094
          break
1095
        } else {
1096
          stop("WARNING NO ADAPTED CASE!")
1097
        }
1098
      }
1099
      return(ret)
1100
    } else {
1101
      return(cur_fuzzy)
1102
    }
1103
  })
1104
  }
1105
  non_inter_fuzzy = rec_substract_region(region1[,1:4], region2[,1:4])
1106
  if (is.null(non_inter_fuzzy)) {return(non_inter_fuzzy)}
1107
  tmp_ulist = unlist(non_inter_fuzzy)
1108
  tmp_names = names(tmp_ulist)[1:4]
1109
  non_inter_fuzzy = data.frame(matrix(tmp_ulist, ncol=4, byrow=TRUE), stringsAsFactors=FALSE)
1110
  names(non_inter_fuzzy) = tmp_names
1111
  non_inter_fuzzy$chr = as.character(non_inter_fuzzy$chr)
1112
  non_inter_fuzzy$chr = as.numeric(non_inter_fuzzy$chr)
1113
  non_inter_fuzzy$lower_bound = as.numeric(non_inter_fuzzy$lower_bound)
1114
  non_inter_fuzzy$upper_bound = as.numeric(non_inter_fuzzy$upper_bound)
1115
  non_inter_fuzzy = non_inter_fuzzy[order(non_inter_fuzzy$lower_bound),]
1116
  return(non_inter_fuzzy)
1117
}
1118

    
1119
union_regions = function(# Aggregate regions that intersect themselves.
1120
### This function is based on sort of lower bounds to detect regions that intersect. We compare lower bound and upper bound of the porevious item. This function embed a while loop and break break regions list become stable.
1121
regions ##<< The Regions to be aggregated
1122
) {
1123
  if (is.null(regions)) {return(regions)}
1124
  if (nrow(regions) == 0) {return(regions)}
1125
  old_length = length(regions[,1])
1126
  new_length = 0
1127

    
1128
  while (old_length != new_length) {
1129
    regions = regions[order(regions$lower_bound), ]
1130
    regions$stop = !c(regions$lower_bound[-1] - regions$upper_bound[-length(regions$lower_bound)] <= 1, TRUE)
1131
    vec_end_1 = which(regions$stop)
1132
    if (length(vec_end_1) == 0) {
1133
      vec_end_1 = c(length(regions$stop))
1134
    }
1135
    if (vec_end_1[length(vec_end_1)] != length(regions$stop)) {
1136
      vec_end_1 = c(vec_end_1, length(regions$stop))
1137
    }
1138
    vec_beg_1 = c(1, vec_end_1[-length(vec_end_1)] + 1)
1139
    union = apply(t(1:length(vec_beg_1)), 2, function(i) {
1140
      chr = regions$chr[vec_beg_1[i]]
1141
      lower_bound = min(regions$lower_bound[vec_beg_1[i]:vec_end_1[i]])
1142
      upper_bound = max(regions$upper_bound[vec_beg_1[i]:vec_end_1[i]])
1143
      cur_index = regions$cur_index[vec_beg_1[i]]
1144
      data.frame(list(chr=chr, lower_bound=lower_bound, upper_bound=upper_bound, cur_index=cur_index))
1145
      })
1146
    union = collapse_regions(union)
1147
    old_length = length(regions[,1])
1148
    new_length = length(union[,1])
1149
    regions = union
1150
  }
1151
  return(union)
1152
}
1153

    
1154
# remove_aligned_wp = function(# Remove wp nucs from common nucs list.
1155
# ### It is based on common wp nucs index on nucs and region.
1156
# strain_maps, ##<< Nuc maps.
1157
# cur_index, ##<< The region of interest index.
1158
# tmp_common_nucs, ##<< the list of wp nucs.
1159
# strain##<< The strain to consider.
1160
# ){
1161
#   fuzzy_nucs = strain_maps[[strain]]
1162
#   fuzzy_nucs = fuzzy_nucs[fuzzy_nucs$cur_index == cur_index,]
1163
#   fuzzy_nucs = fuzzy_nucs[order(fuzzy_nucs$index_nuc),]
1164
#   if (length(fuzzy_nucs[,1]) == 0) {return(fuzzy_nucs)}
1165
#   if (sum(fuzzy_nucs$index_nuc == min(fuzzy_nucs$index_nuc):max(fuzzy_nucs$index_nuc)) != max(fuzzy_nucs$index_nuc)) {"Warning in index!"}
1166
#   anti_index_1 = tmp_common_nucs[[paste("index_nuc", strain, sep="_")]]
1167
#   fuzzy_nucs = fuzzy_nucs[-anti_index_1,]
1168
#   return(fuzzy_nucs)
1169
# }
1170

    
1171
translate_regions = function(# Translate a list of regions from a strain ref to another.
1172
### This function is an elaborated call to translate_cur.
1173
regions, ##<< Regions to be translated.
1174
combi, ##<< Combination of strains.
1175
cur_index, ##<< The region of interest index.
1176
config=NULL, ##<< GLOBAL config variable
1177
roi ##<< The region of interest.
1178
) {
1179
  tr_regions = apply(t(1:length(regions[,1])), 2, function(i) {
1180
    tmp_regions_ref2 = list(name="foo", begin=regions[i,]$lower_bound, end=regions[i,]$upper_bound, chr=as.character(regions[i,]$chr), strain_ref = combi[2])
1181
    big_cur =  roi
1182
    trs_tmp_regions_ref2 = translate_cur(tmp_regions_ref2, combi[1], config = config, big_cur = big_cur)
1183
    if (is.null(trs_tmp_regions_ref2)) {
1184
      return(NULL)
1185
    } else {
1186
      return(data.frame(list(chr=trs_tmp_regions_ref2$chr, lower_bound=min(trs_tmp_regions_ref2$begin, trs_tmp_regions_ref2$end), upper_bound=max(trs_tmp_regions_ref2$begin, trs_tmp_regions_ref2$end), cur_index=cur_index)))
1187
    }
1188
  })
1189

    
1190
  return(collapse_regions(tr_regions))
1191
}
1192

    
1193
collapse_regions = function(# reformat an "apply  manipulated" list of regions
1194
### Utils to reformat an "apply  manipulated" list of regions
1195
regions ##< a list of regions
1196
) {
1197
  if (is.null(regions)) {
1198
    return(NULL)
1199
  } else {
1200
    regions = do.call(rbind, regions)
1201
    regions$chr = as.character(regions$chr)
1202
    regions$chr = as.numeric(regions$chr)
1203
    regions$lower_bound = as.numeric(regions$lower_bound)
1204
    regions$upper_bound = as.numeric(regions$upper_bound)
1205
    regions = regions[order(regions$lower_bound),]
1206
    return(regions)
1207
  }
1208
}
1209

    
1210

    
1211
crop_fuzzy = function(# Crop bound of regions according to region of interest bound
1212
### The fucntion is no more necessary since we remove "big_cur" bug in translate_cur function.
1213
tmp_fuzzy_nucs, ##<< the regiuons to be croped.
1214
roi, ##<< The region of interest.
1215
strain, ##<< The strain to consider.
1216
config=NULL ##<< GLOBAL config variable
1217
) {
1218
  tr_roi = translate_cur(roi, strain, config = config)
1219
  tr_roi_begin = min(tr_roi$begin, tr_roi$end)
1220
  tr_roi_end = max(tr_roi$begin, tr_roi$end)
1221
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound < tr_roi_begin,1]) > 0) {
1222
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound < tr_roi_begin,]$lower_bound = tr_roi_begin
1223
  }
1224
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound < tr_roi_begin,1]) > 0) {
1225
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound < tr_roi_begin,]$upper_bound = tr_roi_begin
1226
  }
1227
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound > tr_roi_end,1]) > 0) {
1228
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$lower_bound > tr_roi_end,]$lower_bound = tr_roi_end
1229
  }
1230
  if (length(tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound > tr_roi_end,1]) > 0) {
1231
    tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound > tr_roi_end,]$upper_bound = tr_roi_end
1232
  }
1233
  tmp_fuzzy_nucs = tmp_fuzzy_nucs[tmp_fuzzy_nucs$upper_bound != tmp_fuzzy_nucs$lower_bound,]
1234
  return(tmp_fuzzy_nucs)
1235
}
1236

    
1237
get_all_reads = function(# Retrieve Reads
1238
### Retrieve reads for a given marker, combi, form.
1239
marker, ##<< The marker to considere.
1240
combi, ##<< The starin combination to considere.
1241
form="wp", ##<< The nuc form to considere.
1242
config=NULL ##<< GLOBAL config variable
1243
) {
1244
	all_reads = NULL
1245
  for (manip in c("Mnase_Seq", marker)) {
1246
    if (form == "unr") {
1247
		  out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_unr_and_nbreads.tab",sep="")
1248
  		tmp_res = mread.table(file=out_filename, header=TRUE)
1249
			tmp_res = tmp_res[tmp_res[,3] - tmp_res[,2] > 75,]
1250
      tmp_res$form = form
1251
    } else if (form == "wp") {
1252
		 	out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_wp_and_nbreads.tab",sep="")
1253
  		tmp_res = mread.table(file=out_filename, header=TRUE)
1254
      tmp_res$form = form
1255
    } else if (form == "wpunr") {
1256
		 	out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_wp_and_nbreads.tab",sep="")
1257
  		tmp_res = mread.table(file=out_filename, header=TRUE)
1258
      tmp_res$form = "wp"
1259
		  out_filename = paste(config$RESULTS_DIR, "/",combi[1],"_",combi[2],"_",manip,"_unr_and_nbreads.tab",sep="")
1260
  		tmp_res2 = mread.table(file=out_filename, header=TRUE)
1261
			tmp_res2 = tmp_res2[tmp_res2[,3] - tmp_res2[,2] > 75,]
1262
      tmp_res2$form = "unr"
1263
      tmp_res = rbind(tmp_res, tmp_res2)
1264
    }
1265
		if (is.null(all_reads)) {
1266
			all_reads = tmp_res[,c(1:9,length(tmp_res))]
1267
		}
1268
		tmp_res = tmp_res[,-c(1:9,length(tmp_res))]
1269
		all_reads = cbind(all_reads, tmp_res)
1270
  }
1271
  return(all_reads)
1272
}
1273

    
1274
get_design = function(# Build the design for DESeq
1275
### This function build the design according sample properties.
1276
marker, ##<< The marker to considere.
1277
combi, ##<< The starin combination to considere.
1278
all_samples ##<< Global list of samples.
1279
) {
1280
  off1 = 0
1281
  off2 = 0
1282
	manips = c("Mnase_Seq", marker)
1283
	design_rownames = c()
1284
	design_manip = c()
1285
	design_strain = c()
1286
  off2index = function(off) {
1287
  	switch(toString(off),
1288
  		"1"=c(0,1,1),
1289
  	  "2"=c(1,0,1),
1290
    	"3"=c(1,1,0),
1291
  		c(1,1,1)
1292
  		)
1293
  }
1294
	for (manip in manips) {
1295
		tmp_samples = all_samples[ ((all_samples$strain == combi[1] | all_samples$strain == combi[2]) &  all_samples$marker == manip), ]
1296
		tmp_samples = tmp_samples[order(tmp_samples$strain), ]
1297
		if (manip == "H3K4me1" & (off1 != 0 & off2 ==0 )) {
1298
			tmp_samples = tmp_samples[c(off2index(off1), c(1,1)) == 1,]
1299
		} else {
1300
			if (manip != "Mnase_Seq" & (off1 != 0 | off2 !=0)) {
1301
				tmp_samples = tmp_samples[c(off2index(off1), off2index(off2)) == 1,]
1302
			}
1303
		}
1304
		design_manip = c(design_manip, rep(manip, length(tmp_samples$id)))
1305
		for (strain in combi) {
1306
			cols = apply(t(tmp_samples[ (tmp_samples$strain == strain &  tmp_samples$marker == manip), ]$id), 2, function(i){paste(strain, manip, i, sep="_")})
1307
			design_strain = c(design_strain, rep(strain, length(cols)))
1308
			design_rownames = c(design_rownames, cols)
1309
		}
1310
	}
1311
	snep_design = data.frame( row.names=design_rownames, manip=design_manip, strain=design_strain)
1312
	return(snep_design)
1313
}
1314

    
1315
plot_dist_samples = function(# Plot the distribution of reads.
1316
### This fuxntion use the DESeq nomalization feature to compare qualitatively the distribution.
1317
strain, ##<< The strain to considere.
1318
marker, ##<< The marker to considere.
1319
res, ##<< Data
1320
all_samples, ##<< Global list of samples.
1321
NEWPLOT = TRUE ##<< If FALSE the curve will be add to the current plot.
1322
) {
1323
	cols = apply(t(all_samples[ (all_samples$strain == strain &  all_samples$marker == marker), ]$id), 2, function(i){paste(strain, marker, i, sep="_")})
1324
	snepCountTable = res[,cols]
1325
	snepDesign = data.frame(
1326
		row.names = cols,
1327
		manip = rep(marker, length(cols)),
1328
		strain = rep(strain, length(cols))
1329
		)
1330
	cdsFull = newCountDataSet(snepCountTable, snepDesign)
1331
	sizeFactors = estimateSizeFactors(cdsFull)
1332
	# print(sizeFactors[[1]])
1333
	sample_ids = all_samples[ (all_samples$strain == strain &  all_samples$marker == marker), ]$id
1334
	if (NEWPLOT) {
1335
		plot(density(res[,paste(strain, marker, sample_ids[1], sep="_")] / sizeFactors[[1]][1]), col=0, main=paste(strain, marker))
1336
		NEWPLOT = FALSE
1337
	}
1338
	for (it in 1:length(sample_ids)) {
1339
		sample_id = sample_ids[it]
1340
		lines(density(res[,paste(strain, marker, sample_id, sep="_")] / sizeFactors[[1]][it]), col = it + 1, lty = it)
1341
	}
1342
  legend("topright", col=(1:length(sample_ids))+1, lty=1:length(sample_ids), legend=cols)
1343
}
1344

    
1345

    
1346

    
1347

    
1348
analyse_count_table = structure(function(# Compute the list of SNEPs for a given set of marker, strain combination and nuc form.
1349
### This function uses
1350
marker, ##<< The marker involved.
1351
combi, ##<< The strain combination involved.
1352
form, ##<< the nuc form involved.
1353
all_samples, ##<< Global list of samples.
1354
FDR = 0.0001, ## the specific False Discover Rate
1355
config=NULL ##<< GLOBAL config variable
1356
) {
1357
  # PRETREAT
1358
  snep_design = get_design(marker, combi, all_samples)
1359
  mnase_design = snep_design[snep_design$manip == "Mnase_Seq", ]
1360
  snep_reads = get_all_reads(marker, combi, form, config=config)
1361
	snep_count_table = snep_reads[, rownames(snep_design)]
1362
  mnase_count_table = snep_reads[, rownames(mnase_design)]
1363
  # RUN ANALYSE FOPR SNEPS
1364
	cdsFull = newCountDataSet(snep_count_table, snep_design)
1365
	cdsFull1 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
1366
	fit1 = fitNbinomGLMs(cdsFull1, count ~ manip * strain)
1367
	cdsFull0 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
1368
	fit0 = fitNbinomGLMs(cdsFull0, count ~ manip + strain)
1369
	pvalsGLM = nbinomGLMTest( fit1, fit0 )
1370
  # RUN ANALYSE FOR MNASE
1371
  mnase_cdsFull = newCountDataSet(mnase_count_table, mnase_design$strain)
1372
  mnase_cdsFull1 = estimateDispersions(estimateSizeFactors(mnase_cdsFull), fitType="local", method="pooled", sharingMode="maximum")
1373
  res_mnase = nbinomTest( mnase_cdsFull1, combi[1], combi[2])
1374
  # GLOBAL RESULTS
1375
  #   SNEPS
1376
  k = names(fit1)
1377
  snep_reads[[k[2]]] = signif(fit1[[k[2]]], 5)
1378
  snep_reads[[k[3]]] = signif(fit1[[k[3]]], 5)
1379
  snep_reads[[k[4]]] = signif(fit1[[k[4]]], 5)
1380
	snep_reads$pvalsGLM = signif(pvalsGLM, 5)
1381
  # print(snep_design)
1382
	thres = FDR(snep_reads$pvalsGLM, FDR)
1383
	snep_reads$snep_index = snep_reads$pvalsGLM < thres
1384
  print(paste(sum(snep_reads$snep_index), " SNEPs found for ", length(snep_reads[,1])," nucs and ", FDR*100,"% of FDR.", sep = ""))
1385
  #   MNASE
1386
  snep_reads[["mnase_l2fc"]] = signif(res_mnase$log2FoldChange, 5)
1387
  snep_reads[["mnase_l2fc_pval"]] = signif(res_mnase$pval, 5)
1388
  # write results
1389
	snep_filename = paste(config$RESULTS_DIR, "/" ,combi[1],"_",combi[2],"_",marker,"_", form, "_snep.tab",sep="")
1390
	write.table(snep_reads, file=snep_filename, row.names=FALSE, quote=FALSE)      
1391
  return(snep_reads)
1392
  },  ex=function(){
1393
    marker = "H3K4me1"
1394
    combi = c("BY", "YJM")
1395
    form = "wpunr" # "wp" | "unr" | "wpunr"
1396
    # foo = analyse_count_table(marker, combi, form)
1397
    # foo = analyse_count_table("H4K12ac", c("BY", "RM"), "wp")
1398
})
1399

    
1400

    
1401

    
1402

    
1403

    
1404

    
1405

    
1406

    
1407

    
1408

    
1409

    
1410

    
1411

    
1412

    
1413

    
1414

    
1415

    
1416

    
1417

    
1418

    
1419

    
1420

    
1421

    
1422

    
1423
ROM2ARAB = function(# Roman to Arabic pair list.
1424
### Utility to convert Roman numbers into Arabic numbers
1425
){list(
1426
  "I" = 1,
1427
  "II" = 2,
1428
  "III" = 3,
1429
  "IV" = 4,
1430
  "V" = 5,
1431
  "VI" = 6,
1432
  "VII" = 7,
1433
  "VIII" = 8,
1434
  "IX" = 9,
1435
  "X" = 10,
1436
  "XI" = 11,
1437
  "XII" = 12,
1438
  "XIII" = 13,
1439
  "XIV" = 14,
1440
  "XV" = 15,
1441
  "XVI" = 16,
1442
  "XVII" = 17,
1443
  "XVIII" = 18,
1444
  "XIX" = 19,
1445
  "XX" = 20
1446
)}
1447

    
1448
switch_pairlist = structure(function(# Switch a pairlist
1449
### Take a pairlist key:value and return the switched pairlist value:key.
1450
l ##<< The pairlist to switch.
1451
) {
1452
	ret = list()
1453
	for (name in names(l)) {
1454
		ret[[as.character(l[[name]])]] = name
1455
	}
1456
	ret
1457
### The switched pairlist.
1458
}, ex=function(){
1459
	l = list(key1 = "value1", key2 = "value2")
1460
	print(switch_pairlist(l))
1461
})
1462

    
1463
ARAB2ROM = function(# Arabic to Roman pair list.
1464
### Utility to convert Arabic numbers to Roman numbers
1465
){switch_pairlist(ROM2ARAB())}
1466

    
1467

    
1468

    
1469

    
1470
c2c_extraction = function(# Extract a sub part of the corresponding c2c file
1471
### This fonction allows to access to a specific part of the c2c file.
1472
strain1, ##<< the key strain
1473
strain2, ##<< the target strain
1474
chr=NULL, ##<< if defined, the c2c will be filtered according to the chromosome value
1475
lower_bound=NULL, ##<< if defined, the c2c will be filtered for part of the genome upper than lower_bound
1476
upper_bound=NULL, ##<< if defined, the c2c will be filtered for part of the genome lower than upper_bound
1477
config=NULL##<<  GLOBAL config variable
1478
) {
1479
  reverse = (strain1=="RM" & strain2=="BY") | strain1=="YJM"
1480
	# Launch c2c file
1481
	if (reverse) {
1482
		c2c_filename = config$C2C_FILES[[paste(strain2, "-", strain1, sep="")]]
1483
	} else {
1484
		c2c_filename = config$C2C_FILES[[paste(strain1, "-", strain2, sep="")]]
1485
	}
1486
	c2c = mread.table(c2c_filename, stringsAsFactors=FALSE)
1487
  # Filtering unagapped
1488
  c2c = c2c[c2c$V6=="-",]
1489
	# Reverse
1490
	if (reverse) {
1491
		tmp_col = c2c$V1
1492
		c2c$V1 = c2c$V7
1493
		c2c$V7 = tmp_col
1494
		tmp_col = c2c$V2
1495
		c2c$V2 = c2c$V9
1496
		c2c$V9 = tmp_col
1497
		tmp_col = c2c$V3
1498
		c2c$V3 = c2c$V10
1499
		c2c$V10 = tmp_col
1500
	}
1501
  if (!is.null(chr)) {
1502
  	if (strain1 == "BY") {
1503
  		chro_1 = paste("chr", ARAB2ROM()[[chr]], sep="")
1504
  	} else if (strain1 == "RM") {
1505
  	  chro_1 = paste("supercontig_1.",chr,sep="")
1506
  	} else if (strain1 == "YJM") {
1507
  	  chro_1 = switch_pairlist(config$FASTA_INDEXES$YJM)[[chr]]
1508
  	}
1509
  	c2c = c2c[c2c$V1 == chro_1,]
1510
    if (!is.null(lower_bound)) {
1511
      if (length(c2c[c2c$V3 < lower_bound & c2c$V2 < c2c$V3, 1] > 0)) {c2c[c2c$V3 < lower_bound & c2c$V2 < c2c$V3,c("V2", "V3") ] = lower_bound}
1512
      if (length(c2c[c2c$V2 < lower_bound & c2c$V3 < c2c$V2, 1] > 0)) {c2c[c2c$V2 < lower_bound & c2c$V3 < c2c$V2,c("V2", "V3") ] = lower_bound}      
1513
      c2c = c2c[c2c$V2 - c2c$V3 != 0,]
1514
    }
1515
    if (!is.null(upper_bound)) {
1516
      if (length(c2c[c2c$V2 > upper_bound & c2c$V2 < c2c$V3, 1] > 0)) {c2c[c2c$V2 > upper_bound & c2c$V2 < c2c$V3, c("V2", "V3")] = upper_bound}
1517
      if (length(c2c[c2c$V3 > upper_bound & c2c$V3 < c2c$V2, 1] > 0)) {c2c[c2c$V3 > upper_bound & c2c$V3 < c2c$V2, c("V2", "V3")] = upper_bound}      
1518
      c2c = c2c[c2c$V2 - c2c$V3 != 0,]
1519
    }
1520
  }
1521
  return(c2c)
1522
# It retruns the appropriate c2c file part.
1523
}
1524

    
1525
translate_cur = structure(function(# Translate coords of a genome region.
1526
### This function is used in the examples, usualy you have to define your own translation function and overwrite this one using \emph{unlockBinding} features. Please, refer to the example.
1527
roi, ##<< Original genome region of interest.
1528
strain2, ##<< The strain in wich you want the genome region of interest.
1529
config=NULL, ##<< GLOBAL config variable
1530
big_cur=NULL ##<< A largest region than roi use to filter c2c if it is needed.
1531
) {
1532
	strain1 = roi$strain_ref  
1533
  # Do something or nothing?
1534
  if (is.null(config$NEUTRAL_TRANSLATE_CUR)) {
1535
    config$NEUTRAL_TRANSLATE_CUR = FALSE
1536
  }
1537
	if (strain1 == strain2 | config$NEUTRAL_TRANSLATE_CUR) {
1538
    roi$strain_ref = strain2
1539
    roi$length = roi$end - roi$begin + sign(roi$end - roi$begin)
1540
		return(roi)
1541
	}
1542
  
1543
	# Extract c2c file
1544
	if (!is.null(big_cur)) {
1545
  	# Dealing with big_cur
1546
		if (roi$strain_ref != big_cur$strain_ref) {
1547
      big_cur = translate_cur(big_cur, roi$strain_ref, config=config)
1548
    }
1549
    if (big_cur$end < big_cur$begin) {
1550
      tmp_var = big_cur$begin
1551
      big_cur$begin = big_cur$end
1552
      big_cur$end = tmp_var
1553
      big_cur$length = big_cur$end - big_cur$begin + 1
1554
    }
1555
    if (big_cur$chr!=roi$chr | roi$end > big_cur$end | roi$end < big_cur$begin | roi$begin > big_cur$end | roi$begin < big_cur$begin) {
1556
      print("WARNING! Trying to translate a roi not included in a big_cur.")
1557
      return(NULL)
1558
    }
1559
  	c2c = c2c_extraction(strain1, strain2, big_cur$chr, big_cur$begin, big_cur$end, config=config)
1560
  } else {
1561
    # No big_cur
1562
  	c2c = c2c_extraction(strain1, strain2, roi$chr, config=config)    
1563
  }
1564
  
1565
  #	Convert initial roi$chr into c2c format
1566
  reverse = (strain1=="RM" & strain2=="BY") | strain1=="YJM"
1567
	begin_1 = roi$begin
1568
  end_1 = roi$end
1569
  if (reverse) {
1570
  	tmptransfostart = c2c[(c2c$V3>=begin_1 & c2c$V2<=begin_1 & c2c$V8==1) | (c2c$V2>=begin_1 & c2c$V3<=begin_1 & c2c$V8==-1),]
1571
    tmptransfostop = c2c[(c2c$V3>=end_1   & c2c$V2<=end_1   & c2c$V8==1) | (c2c$V2>=end_1   & c2c$V3<=end_1   & c2c$V8==-1),]
1572
	} else {
1573
		tmptransfostart = c2c[c2c$V3>=begin_1 & c2c$V2<=begin_1,]
1574
	  tmptransfostop = c2c[c2c$V3>=end_1 & c2c$V2<=end_1,]
1575
	}
1576

    
1577
	# Never happend conditions ...
1578
	{
1579
		if (length(tmptransfostart$V8) == 0) {
1580
			# begin_1 is between to lines: shift begin_1 to the start of 2nd line.
1581
      if (sum(c2c$V3 >= c2c$V2) != 0) {
1582
  			tmp_c2c = c2c[c2c$V2>=begin_1,]
1583
  			begin_1 = min(tmp_c2c$V2)
1584
      } else {
1585
  			tmp_c2c = c2c[c2c$V3>=begin_1,]
1586
  			begin_1 = min(tmp_c2c$V3)
1587
      }
1588
			if (reverse) {
1589
		  	tmptransfostart = c2c[(c2c$V3>=begin_1 & c2c$V2<=begin_1 & c2c$V8==1) | (c2c$V2>=begin_1 & c2c$V3<=begin_1 & c2c$V8==-1),]
1590
			} else {
1591
				tmptransfostart = c2c[c2c$V3>=begin_1 & c2c$V2<=begin_1,]
1592
			}
1593
			if (length(tmptransfostart$V8) == 0) {
1594
				if (!is.null(big_cur)) {
1595
					return(NULL)
1596
					tmptransfostart = c2c[c2c$V3>=big_cur$begin & c2c$V2<=big_cur$begin,]
1597
				} else {
1598
					print(tmptransfostart)
1599
					print(tmptransfostop)
1600
					stop("Never happend condition 1.")
1601
				}
1602
			}
1603
		}
1604
		if (length(tmptransfostop$V8) == 0) {
1605
			# end_1 is between to lines: shift end_1 to the end of 2nd line.
1606
      if (sum(c2c$V3 >= c2c$V2) != 0) {
1607
  			tmp_c2c = c2c[c2c$V3<=end_1,]
1608
  			end_1 = max(tmp_c2c$V3)
1609
      } else {
1610
  			tmp_c2c = c2c[c2c$V2<=end_1,]
1611
  			end_1 = max(tmp_c2c$V2)
1612
      }
1613
			if (reverse) {
1614
		    tmptransfostop = c2c[(c2c$V3>=end_1   & c2c$V2<=end_1   & c2c$V8==1) | (c2c$V2>=end_1   & c2c$V3<=end_1   & c2c$V8==-1),]
1615
			} else {
1616
			  tmptransfostop = c2c[c2c$V3>=end_1 & c2c$V2<=end_1,]
1617
			}
1618
			if (length(tmptransfostop$V8) == 0) {
1619
				if (!is.null(big_cur)) {
1620
					return(NULL)
1621
				  tmptransfostop = c2c[c2c$V3>=big_cur$end & c2c$V2<=big_cur$end,]
1622
				} else {
1623
					print(tmptransfostart)
1624
					print(tmptransfostop)
1625
					stop("Never happend condition 2.")
1626
				}
1627
			}
1628
		}
1629
		if (length(tmptransfostart$V8) != 1) {
1630
			# print("many start")
1631
			tmptransfostart = tmptransfostart[tmptransfostart$V3>=begin_1 & tmptransfostart$V2==begin_1,]
1632
			if (length(tmptransfostart$V8) != 1) {
1633
				print(tmptransfostart)
1634
				print(tmptransfostop)
1635
  			stop("Never happend condition 3.")
1636
			}
1637
		}
1638
		if (length(tmptransfostop$V8) != 1) {
1639
			# print("many stop")
1640
		  tmptransfostop = tmptransfostop[tmptransfostop$V3==end_1 & tmptransfostop$V2<=end_1,]
1641
			if (length(tmptransfostop$V8) != 1) {
1642
				print(tmptransfostart)
1643
				print(tmptransfostop)
1644
  			stop("Never happend condition 4.")
1645
			}
1646
		}
1647
		if (tmptransfostart$V7 != tmptransfostop$V7) {
1648
			print(tmptransfostart)
1649
			print(tmptransfostop)
1650
 			stop("Problem with genome region of interest of strain 1. \nIt is translated over many contigs into strain 2 ref. \nSorry, but you have to redefine your region of interest.")
1651
		}
1652
	}
1653
  
1654
  # Deal with strand
1655
  if (tmptransfostart$V8 == 1) {
1656
    begin_2 = tmptransfostart$V9 + (begin_1 - tmptransfostart$V2)
1657
    end_2 = tmptransfostop$V9 + (end_1 - tmptransfostop$V2)
1658
  } else {
1659
    begin_2 = tmptransfostart$V9 - (begin_1 - tmptransfostart$V2)
1660
    end_2 = tmptransfostop$V9 - (end_1 - tmptransfostop$V2)
1661
  }
1662
	# Build returned roi
1663
	roi$strain_ref = strain2
1664
	if (roi$strain_ref == "BY") {
1665
		roi$chr = ROM2ARAB()[[substr(tmptransfostart$V7, 4, 12)]]
1666
	} else {
1667
		roi$chr = config$FASTA_INDEXES[[strain2]][[tmptransfostart$V7]]
1668
	}
1669
  roi$begin = begin_2
1670
  roi$end = end_2
1671
	if (sign(roi$end - roi$begin) == 0) {
1672
		roi$length = 1
1673
	} else {
1674
		roi$length = roi$end - roi$begin + sign(roi$end - roi$begin) * 1
1675
	}
1676
  return(roi)
1677
}, ex=function(){
1678
	# Define new translate_cur function...
1679
	translate_cur = function(roi, strain2, config) {
1680
		strain1 = roi$strain_ref
1681
		if (strain1 == strain2) {
1682
			return(roi)
1683
		} else {
1684
		  stop("Here is my new translate_cur function...")
1685
		}
1686
	}
1687
	# Binding it by uncomment follwing lines.
1688
	# unlockBinding("translate_cur", as.environment("package:nm"))
1689
	# unlockBinding("translate_cur", getNamespace("nm"))
1690
	# assign("translate_cur", translate_cur, "package:nm")
1691
	# assign("translate_cur", translate_cur, getNamespace("nm"))
1692
	# lockBinding("translate_cur", getNamespace("nm"))
1693
	# lockBinding("translate_cur", as.environment("package:nm"))
1694
})
1695

    
1696

    
1697
compute_curs = function (# Compute Common Uninterrupted Regions (CUR)
1698
### CURs are regions that can be aligned between the genomes
1699
diff_allowed = 30, ##<< the maximum indel width allowe din a CUR
1700
min_cur_width = 4000, ##<< The minimum width of a CUR
1701
combis = list(c("BY", "RM"), c("BY", "YJM"), c("RM", "YJM")), ##<< list of strain than will be tested as uninterrupted regions
1702
config = NULL ##<< GLOBAL config variable
1703
) {
1704
  dir.create(config$RESULTS_DIR, recursive=TRUE, showWarnings=FALSE)
1705
  check_overlaping = function(strain1, strain2, chr, lower_bound, upper_bound, config=NULL) {
1706
    c2c = c2c_extraction(strain1, strain2, chr, lower_bound, upper_bound, config=config)
1707
    check_homogeneity(c2c)
1708
  	if (length(c2c[,1]) == 0 ) {
1709
      stop("WARNING! checking overlapping for a region corresponding to an empty c2c.")
1710
    } else {
1711
  		lower_bounds = apply(t(1:nrow(c2c)), 2,function(i){l = c2c[i,]; min(l$V2, l$V3)})
1712
  		upper_bounds = apply(t(1:nrow(c2c)), 2,function(i){l = c2c[i,]; max(l$V2, l$V3)})
1713
  		tmp_index = order(lower_bounds)
1714
      lower_bounds = lower_bounds[tmp_index]
1715
      upper_bounds = upper_bounds[tmp_index]
1716
      tmp_diff = lower_bounds[-1] - upper_bounds[-length(upper_bounds)]
1717
      ov_index = which(tmp_diff < 0)
1718
      if(length(ov_index < 0) !=0 ) {
1719
        ov_index = ov_index[1]
1720
        print(paste("WARNING! Overlaping", " (", strain1, ",", strain2, ") chr: ", c2c[1,]$V1, sep=""))
1721
        c2c_corrupted = c2c[tmp_index,][c(ov_index, ov_index + 1),]
1722
        print(c2c_corrupted)
1723
        return(list(lower_bounds[ov_index+1] - 1, upper_bounds[ov_index] + 1))
1724
      }
1725
      return(NULL)
1726
    }
1727
  }
1728

    
1729
  check_homogeneity = function(sub_c2c) {
1730
    tmp_signs = sign(sub_c2c$V2 - sub_c2c$V3)
1731
    tmp_signs = tmp_signs[tmp_signs != 0]
1732
  	if (sum(tmp_signs[1]  != tmp_signs)) {
1733
  		print(paste("*************** ERROR, non homogenous region (sign)! ********************"))
1734
      print(tmp_signs)
1735
  	}
1736
    tmp_signs2 = sign(sub_c2c$V9 - sub_c2c$V10)
1737
    tmp_signs2 = tmp_signs2[tmp_signs2 != 0]
1738
  	if (sum(tmp_signs2[1]  != tmp_signs2)) {
1739
  		print(paste("*************** ERROR, non homogenous region (sign2)! ********************"))
1740
      print(tmp_signs2)
1741
  	}
1742
  	if (length(unique(sub_c2c[,c(1,7,8)])[,2]) != 1) {
1743
  		print("*************** ERROR, non homogenous region chrs or V8! ********************")
1744
  	}
1745
  }
1746

    
1747
  test_and_squeeze_rois = function(foo, config=NULL) {
1748
    is_it_ok = function(list1, list2) {
1749
      bar = cbind(list1$begin, list2$begin, abs(list1$begin - list2$begin), list1$end, list2$end, abs(list1$end - list2$end), list1$length, list2$length, abs(list1$length - list2$length))
1750
      ok = length(bar[bar[,3] != 0 | bar[,6] != 0, ]) == 0
1751
      if (!ok) {
1752
        print(bar[bar[,3] != 0 | bar[,6] != 0, ])
1753
      }
1754
      return (ok)
1755
    }
1756
    squeeze_rois = function(list1, list2) {
1757
      rois = apply(t(1:nrow(list1)), 2, function(i){
1758
        roi = list1[i,]
1759
        roi2 = list2[i,]
1760
        roi$begin = max(roi$begin, roi2$begin)
1761
        roi$end = min(roi$end, roi2$end)
1762
        roi$length =  roi$end - roi$begin + 1
1763
        return(roi)
1764
      })
1765
      return(do.call(rbind, rois))
1766
    }
1767
    # foo_orig = compute_curs2(config=config)
1768
    # foo = foo_orig
1769
    STOP = FALSE
1770
    nb_round = 0
1771
    while(!STOP) {
1772
      nb_round = nb_round + 1
1773
      print(paste("2-2 round #", nb_round, sep=""))
1774
      fooby = translate_curs(foo, "BY", config=config)
1775
      fooyjm = translate_curs(foo, "YJM", config=config)
1776
      fooyjmby = translate_curs(fooyjm, "BY", config=config)
1777
      if (!is_it_ok(fooby, fooyjmby)) {
1778
        print("case 1")
1779
        foo = squeeze_rois(fooby, fooyjmby)
1780
    		next
1781
      }
1782
      foorm = translate_curs(foo, "RM", config=config)
1783
      foormby = translate_curs(foorm, "BY", config=config)
1784
      if (!is_it_ok(fooby, foormby)) {
1785
        print("case 2")
1786
        foo = squeeze_rois(fooby, foormby)
1787
    		next
1788
      }
1789
      fooyjmrm = translate_curs(fooyjm, "RM", config=config)
1790
      fooyjmrmyjm = translate_curs(fooyjmrm, "YJM", config=config)
1791
      if (!is_it_ok(fooyjm, fooyjmrmyjm)) {
1792
        print("case 3")
1793
        foo = squeeze_rois(fooyjm, fooyjmrmyjm)
1794
        next
1795
      }
1796
      foormyjm = translate_curs(foorm, "YJM", config=config)
1797
      foormyjmrm = translate_curs(foormyjm, "RM", config=config)
1798
      if (!is_it_ok(foorm, foormyjmrm)) {
1799
        print("case 4")
1800
        foo = squeeze_rois(foorm, foormyjmrm)
1801
        next
1802
      }
1803
      foo = translate_curs(foo, "BY", config=config)
1804
      STOP = TRUE
1805
    }
1806
    STOP = FALSE
1807
    nb_round = 0
1808
    while(!STOP) {
1809
      nb_round = nb_round + 1
1810
      print(paste("3-3 round #", nb_round, sep=""))
1811
      fooby = translate_curs(foo, "BY", config=config)
1812
      foobyrm = translate_curs(fooby, "RM", config=config)
1813
      foobyrmyjm = translate_curs(foobyrm, "YJM", config=config)
1814
      foobyrmyjmby = translate_curs(foobyrmyjm, "BY", config=config)
1815
      if (!is_it_ok(fooby, foobyrmyjmby)) {
1816
        print("case 1")
1817
        foo = squeeze_rois(fooby, foobyrmyjmby)
1818
      }
1819
      fooby = translate_curs(foo, "BY", config=config)
1820
      foobyyjm = translate_curs(fooby, "YJM", config=config)
1821
      foobyyjmrm = translate_curs(foobyyjm, "RM", config=config)
1822
      foobyyjmrmby = translate_curs(foobyyjmrm, "BY", config=config)
1823
      if (!is_it_ok(fooby, foobyyjmrmby)) {
1824
        print("case 2")
1825
        foo = squeeze_rois(fooby, foobyyjmrmby)
1826
        next
1827
      }
1828
      foo = translate_curs(foo, "BY", config=config)
1829
      STOP = TRUE
1830
    }
1831
    print("end")
1832
    return(foo)
1833
  }
1834

    
1835
  get_inter_strain_rois = function(strain1, strain2, diff_allowed = 30, min_cur_width = 200, config=NULL) {
1836
    c2c = c2c_extraction(strain1, strain2, config=config)
1837
    # computing diffs
1838
    diff = c2c$V2[-1] - c2c$V3[-length(c2c$V2)]
1839
    diff2 = c2c$V9[-1] - c2c$V10[-length(c2c$V2)]
1840
    # Filtering
1841
  	indexes_stop = which(abs(diff) > diff_allowed | abs(diff2) > diff_allowed)
1842
  	indexes_start = c(1, indexes_stop[-length(indexes_stop)] + rep(1, length(indexes_stop) -1))
1843
    rois = apply(t(1:length(indexes_start)), 2, function(i) {
1844
      if ( i %% 20 == 1) print(paste(i, "/", length(indexes_start)))
1845
      returned_rois = NULL
1846
  		start = indexes_start[i]
1847
  		stop = indexes_stop[i]
1848
  		sub_c2c = c2c[start:stop,]
1849
      check_homogeneity(sub_c2c)
1850
  		if (strain1 == "BY") {
1851
  			chr = ROM2ARAB()[[substr(sub_c2c[1,]$V1,4,10)]]
1852
  		} else {
1853
  			chr = config$FASTA_INDEXES[[strain1]][[sub_c2c[1,]$V1]]
1854
  		}
1855
  		roi = list(chr=chr, begin=min(c(sub_c2c$V2,sub_c2c$V3)), end=max(c(sub_c2c$V2,sub_c2c$V3)), strain_ref=strain1)
1856
  		roi$length = roi$end - roi$begin + 1
1857
  		if (roi$length >= min_cur_width) {
1858
  			lower_bound = roi$begin
1859
  			upper_bound = roi$end
1860
        check = check_overlaping(strain1, strain2, chr, lower_bound, upper_bound, config=config)
1861
        while(!is.null(check)) {
1862
          # print(check)
1863
      		roi1 = roi
1864
          roi1$end = check[[1]]
1865
      		roi1$length = roi1$end - roi1$begin + 1
1866
      		if (roi1$length >= min_cur_width) {
1867
            returned_rois = dfadd(returned_rois,roi1)
1868
          }
1869
          roi$begin = check[[2]]
1870
      		roi$length = roi$end - roi$begin + 1
1871
      		if (roi$length >= min_cur_width) {
1872
      			lower_bound = min(roi$begin, roi$end)
1873
      			upper_bound = max(roi$begin, roi$end)
1874
            check = check_overlaping(strain1, strain2, chr, lower_bound, upper_bound, config=config)
1875
          } else {
1876
            check = NULL
1877
            roi = NULL
1878
          }
1879
        }
1880
        returned_rois = dfadd(returned_rois,roi)
1881
  	  }
1882
    })
1883
    rois = do.call(rbind,rois)
1884
    rois = rois[order(as.numeric(rois$chr), rois$begin), ]
1885
  	return(rois)
1886
  }
1887

    
1888
  translate_curs = function(rois, target_strain, config) {
1889
    tr_rois = apply(t(1:nrow(rois)), 2, function(i){
1890
      roi = rois[i,]
1891
      tr_roi = translate_cur(roi, target_strain, config=config)  
1892
      tmp_begin = min(tr_roi$begin, tr_roi$end)
1893
      tmp_end = max(tr_roi$begin, tr_roi$end)
1894
      tr_roi$begin = tmp_begin
1895
      tr_roi$end = tmp_end
1896
      tr_roi$length =  tr_roi$end - tr_roi$begin + 1
1897
      return(tr_roi)
1898
    })
1899
    tr_rois = do.call(rbind, tr_rois)
1900
    return(tr_rois)
1901
  }
1902

    
1903
  rois = list()
1904
  for (combi in combis) {
1905
    strain1 = combi[1]
1906
    strain2 = combi[2]
1907
    print(paste(strain1, strain2))
1908
    rois_fwd = get_inter_strain_rois(strain1, strain2, min_cur_width = min_cur_width, diff_allowed = diff_allowed, config=config)
1909
    strain1 = combi[2]
1910
    strain2 = combi[1]
1911
    print(paste(strain1, strain2))
1912
    rois_rev = get_inter_strain_rois(strain1, strain2, min_cur_width = min_cur_width, diff_allowed = diff_allowed, config=config)
1913
    tr_rois_rev = translate_curs(rois_rev, combi[1], config)      
1914
    region1 = rois_fwd
1915
    region2 = tr_rois_rev
1916
    rois[[paste(combi[1], combi[2], sep="_")]] = intersect_region(rois_fwd, tr_rois_rev)
1917
  }
1918

    
1919
  if (length(combis)==1) {
1920
    curs = rois[[1]]
1921
  } else {
1922
    reducted_1_rois = intersect_region(rois[["BY_RM"]], rois[["BY_YJM"]])
1923
    reducted_1_rois = reducted_1_rois[reducted_1_rois$length >= min_cur_width, ]
1924
    tr_reducted_1_rois = translate_curs(reducted_1_rois, "RM", config)  
1925
    reducted_2_rois = intersect_region(tr_reducted_1_rois, rois[["RM_YJM"]])
1926
    reducted_2_rois = reducted_2_rois[reducted_2_rois$length >= min_cur_width, ]
1927
    reducted_rois = translate_curs(reducted_2_rois, "BY", config)  
1928
    reducted_rois = reducted_rois[order(as.numeric(reducted_rois$chr), reducted_rois$begin), ]
1929
    squeezed_rois = test_and_squeeze_rois(reducted_rois, config=config)
1930
    curs = squeezed_rois
1931
  }
1932
    
1933
  cur_filename = paste(config$RESULTS_DIR, "/all_curs.tab", sep="")
1934
  write.table(curs, file=cur_filename, row.names=FALSE, quote=FALSE)  
1935
  return(curs)
1936
}
1937

    
1938
intersect_region = function(# Returns the intersection of 2 list on regions.
1939
### This function...
1940
region1, ##<< Original regions.
1941
region2 ##<< Regions to intersect.
1942
) {
1943
  intersection = apply(t(1:nrow(region1)), 2, function(i) {
1944
    roi1 = region1[i, ]
1945
    sub_regions2 = region2[region2$chr == roi1$chr, ]
1946
    sub_regions2 = sub_regions2[roi1$begin <= sub_regions2$begin & sub_regions2$begin <= roi1$end |
1947
                                roi1$begin <= sub_regions2$end & sub_regions2$end <= roi1$end |
1948
                                sub_regions2$begin < roi1$begin  & roi1$end < sub_regions2$end
1949
                                , ]
1950
    if (nrow(sub_regions2) == 0) {
1951
      print("removing a region")
1952
      return(NULL)
1953
    } else if (nrow(sub_regions2) > 1) {
1954
      print("more than one region in intersect_region")          
1955
      return(do.call(rbind, apply(t(1:nrow(sub_regions2)), 2, function(i) {intersect_region(roi1, sub_regions2[i,])})))
1956
    } else {
1957
      roi2 = sub_regions2[1,]
1958
      if (roi1$begin < roi2$begin) {
1959
        print("not the same begin")
1960
        roi1$begin = roi2$begin
1961
        roi1$length =  roi1$end - roi1$begin + 1
1962
      }
1963
      if (roi1$end > roi2$end) {
1964
        print("not the same end")
1965
        roi1$end = roi2$end
1966
        roi1$length =  roi1$end - roi1$begin + 1
1967
      }
1968
      return(roi1)
1969
    }         
1970
  })
1971
  return(do.call(rbind,intersection))
1972
}
1973

    
1974
build_replicates = structure(function(# Stage replicates data
1975
### This function loads in memory the data corresponding to the given experiments.
1976
expe, ##<< a list of vectors corresponding to replicates.
1977
roi, ##<< the region that we are interested in.
1978
only_fetch=FALSE, ##<< filter or not inputs.
1979
get_genome=FALSE,##<< Load or not corresponding genome.
1980
all_samples, ##<< Global list of samples.
1981
config=NULL ##<< GLOBAL config variable.
1982
) {
1983
  build_samples = function(samples_ids, roi, only_fetch=FALSE, get_genome=TRUE, get_ouputs=TRUE, all_samples) {
1984
  	samples=list()
1985
  	for (i in samples_ids) {
1986
  		sample = as.list(all_samples[all_samples$id==i,])
1987
      sample$orig_roi = roi
1988
      sample$roi = translate_cur(roi, sample$strain, config = config)
1989
  		if (get_genome) {
1990
  			# Get Genome
1991
  			fasta_ref_filename = config$FASTA_REFERENCE_GENOME_FILES[[sample$strain]]
1992
  			sample$roi$genome = mread.fasta(fasta_ref_filename, )[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end]
1993
  		}
1994
  		# Get inputs
1995
      if ("tf_input" %in% names(all_samples)) {
1996
        sample_inputs_filename = all_samples$tf_input[all_samples$id==i]
1997
      } else {
1998
        sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep="")
1999
      }
2000
  		sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep="")
2001
  		sample$inputs = mread.table(sample_inputs_filename, stringsAsFactors=FALSE)
2002
  		sample$total_reads = sum(sample$inputs[,4])
2003
  		if (!only_fetch) {
2004
  		  sample$inputs = filter_tf_inputs(sample$inputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300, filter_for_coverage=TRUE)
2005
  	  }
2006
  	  # Get TF outputs for Mnase_Seq samples
2007
  		if (sample$marker == "Mnase_Seq" & get_ouputs) {
2008
        if ("tf_output" %in% names(all_samples)) {
2009
          sample_outputs_filename = all_samples$tf_output[all_samples$id==i]
2010
        } else {
2011
          sample_outputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep="")                
2012
        }
2013
  			sample$outputs = mread.table(sample_outputs_filename, header=TRUE, sep="\t")
2014
  			if (!only_fetch) {
2015
  	  		sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr,  min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
2016
    		}
2017
  		}
2018
  		samples[[length(samples) + 1]] = sample
2019
  	}
2020
  	return(samples)
2021
  }
2022
	replicates = list()
2023
	for(samples_ids in expe) {
2024
		samples = build_samples(samples_ids, roi, only_fetch=only_fetch, get_genome=get_genome, all_samples=all_samples)
2025
		replicates[[length(replicates) + 1]] = samples
2026
	}
2027
	return(replicates)
2028
  }, ex = function() {
2029
    # library(rjson)
2030
    # library(nucleominer)
2031
    #
2032
    # # Read config file
2033
    # json_conf_file = "nucleominer_config.json"
2034
    # config = fromJSON(paste(readLines(json_conf_file), collapse=""))
2035
    # # Read sample file
2036
    # all_samples = read.cvs(config$CSV_SAMPLE_FILE, sep=";", header=TRUE, stringsAsFactors=FALSE)
2037
    # # here are the sample ids in a list
2038
    # expes = list(c(1))
2039
    # # here is the region that we wnt to see the coverage
2040
    # cur = list(chr="8", begin=472000, end=474000, strain_ref="BY")
2041
    # # it displays the corverage
2042
    # replicates = build_replicates(expes, cur, all_samples=all_samples, config=config)
2043
    # out = watch_samples(replicates, config$READ_LENGTH,
2044
    #       plot_coverage = TRUE,
2045
    #       plot_squared_reads = FALSE,
2046
    #       plot_ref_genome = FALSE,
2047
    #       plot_arrow_raw_reads = FALSE,
2048
    #       plot_arrow_nuc_reads = FALSE,
2049
    #       plot_gaussian_reads = FALSE,
2050
    #       plot_gaussian_unified_reads = FALSE,
2051
    #       plot_ellipse_nucs = FALSE,
2052
    #       plot_wp_nucs = FALSE,
2053
    #       plot_wp_nuc_model = FALSE,
2054
    #       plot_common_nucs = FALSE,
2055
    #       height = 50)
2056
  })
2057

    
2058

    
2059

    
2060

    
2061

    
2062

    
2063

    
2064

    
2065

    
2066

    
2067

    
2068

    
2069

    
2070

    
2071

    
2072

    
2073

    
2074

    
2075

    
2076

    
2077

    
2078

    
2079

    
2080

    
2081

    
2082

    
2083

    
2084

    
2085

    
2086

    
2087

    
2088

    
2089

    
2090

    
2091

    
2092

    
2093

    
2094

    
2095

    
2096

    
2097

    
2098

    
2099

    
2100

    
2101

    
2102

    
2103

    
2104

    
2105

    
2106

    
2107

    
2108

    
2109

    
2110

    
2111

    
2112

    
2113

    
2114

    
2115

    
2116

    
2117

    
2118

    
2119

    
2120

    
2121

    
2122

    
2123

    
2124

    
2125

    
2126

    
2127

    
2128

    
2129

    
2130

    
2131

    
2132

    
2133

    
2134

    
2135

    
2136

    
2137

    
2138

    
2139

    
2140
watch_samples = function(# Watching analysis of samples
2141
### This function allows to view analysis for a particuler region of the genome.
2142
replicates, ##<< replicates under the form...
2143
read_length, ##<< length of the reads
2144
plot_ref_genome = TRUE, ##<< Plot (or not) reference genome.
2145
plot_arrow_raw_reads = TRUE,  ##<< Plot (or not) arrows for raw reads.
2146
plot_arrow_nuc_reads = TRUE,  ##<< Plot (or not) arrows for reads aasiocied to a nucleosome.
2147
plot_squared_reads = TRUE,  ##<< Plot (or not) reads in the square fashion.
2148
plot_coverage = FALSE,  ##<< Plot (or not) reads in the covergae fashion. fashion.
2149
plot_gaussian_reads = TRUE,  ##<< Plot (or not) gaussian model of a F anf R reads.
2150
plot_gaussian_unified_reads = TRUE,  ##<< Plot (or not) gaussian model of a nuc.
2151
plot_ellipse_nucs = TRUE,  ##<< Plot (or not) ellipse for a nuc.
2152
change_col = TRUE, ##<< Change the color of each nucleosome.
2153
plot_wp_nucs = TRUE,  ##<< Plot (or not) cluster of nucs
2154
plot_fuzzy_nucs = FALSE,  ##<< Plot (or not) cluster of fuzzy
2155
plot_wp_nuc_model = TRUE,  ##<< Plot (or not) gaussian model for a cluster of nucs
2156
plot_common_nucs = FALSE,  ##<< Plot (or not) aligned reads.
2157
plot_common_unrs = FALSE,  ##<< Plot (or not) unaligned nucleosomal refgions (UNRs).
2158
plot_wp_nucs_4_nonmnase = FALSE,  ##<< Plot (or not) clusters for non inputs samples.
2159
plot_chain = FALSE,  ##<< Plot (or not) clusterised nuceosomes between mnase samples.
2160
plot_sample_id = FALSE, ##<<  Plot (or not) the sample id for each sample.
2161
aggregated_intra_strain_nucs = NULL, ##<< list of aggregated intra strain nucs. If NULL, it will be computed.
2162
aligned_inter_strain_nucs = NULL, ##<< list of aligned inter strain nucs. If NULL, it will be computed.
2163
height = 10, ##<< Number of reads in per million read for each sample, graphical parametre for the y axis.
2164
main=NULL, ##<< main title of the produced plot
2165
xlab=NULL, ##<< xlab of the produced plot
2166
ylab="#reads (per million reads)", ##<< ylab of the produced plot
2167
config=NULL ##<< GLOBAL config variable
2168
){
2169
  returned_list = list()
2170
  # Computing global display parameters
2171
  if (replicates[[1]][[1]]$roi[["begin"]] < replicates[[1]][[1]]$roi[["end"]]) {
2172
	  x_min_glo = replicates[[1]][[1]]$roi[["begin"]]
2173
	  x_max_glo = replicates[[1]][[1]]$roi[["end"]]
2174
  } else {
2175
	  x_min_glo = - replicates[[1]][[1]]$roi[["begin"]]
2176
	  x_max_glo = - replicates[[1]][[1]]$roi[["end"]]
2177
  }
2178
	base_glo = 0
2179
	nb_rank_glo = 0
2180
  for (samples in replicates) {
2181
  	nb_rank_glo = nb_rank_glo + length(samples)
2182
  }
2183
	ylim_glo = c(base_glo, base_glo + height * nb_rank_glo)
2184
	y_min_glo = min(ylim_glo)
2185
	y_max_glo = max(ylim_glo)
2186
  delta_y_glo = y_max_glo - y_min_glo
2187
  # Plot main frame
2188
  if (is.null(xlab)) {
2189
    xlab = paste("Ref strain:", replicates[[1]][[1]]$strain, "chr: ", replicates[[1]][[1]]$roi$chr)
2190
  }
2191
  plot(c(x_min_glo,x_max_glo), c(0,0), ylim=ylim_glo, col=0, yaxt="n", ylab=ylab, xlab=xlab, main=main )
2192
  axis(2, at=0:(nb_rank_glo*2) * delta_y_glo / (nb_rank_glo*2), labels=c(rep(c(height/2,0),nb_rank_glo),height/2))
2193
  # Go
2194
	replicates_wp_nucs = list()
2195
  wp_maps = list()
2196
  fuzzy_maps = list()
2197
  for (replicate_rank in 1:length(replicates)) {
2198
		# Computing replicate parameters
2199
		nb_rank = length(samples)
2200
		base = (replicate_rank-1) * height * nb_rank
2201
		ylim = c(base, base + height * nb_rank)
2202
		y_min = min(ylim)
2203
		y_max = max(ylim)
2204
	  delta_y = y_max - y_min
2205
		samples = replicates[[replicate_rank]]
2206
		for (sample_rank in 1:length(samples)) {
2207
			# computing sample parameters
2208
			sample = samples[[sample_rank]]
2209
			y_lev = y_min + (sample_rank - 0.5) * delta_y/nb_rank
2210
      if (plot_sample_id) {
2211
  			text(x_min_glo, y_lev + height/2 - delta_y_glo/100, labels=paste("(",sample$id,") ",sample$strain, " ", sample$marker, sep=""))
2212
      }
2213

    
2214
		  if (sample$roi[["begin"]] < sample$roi[["end"]]) {
2215
			  x_min = sample$roi[["begin"]]
2216
			  x_max = sample$roi[["end"]]
2217
		  } else {
2218
			  x_min = - sample$roi[["begin"]]
2219
			  x_max = - sample$roi[["end"]]
2220
		  }
2221
			shift = x_min_glo - x_min
2222
	    # Plot Genome seq
2223
			if (plot_ref_genome) {
2224
		  	text(1:length(sample$roi$genome) + x_min - 1 + shift, rep(y_lev - height/2, length(sample$roi$genome)), labels=toupper(sample$roi$genome), cex=dev.size()[1]*9/(x_max-x_min), family="Courier")
2225
		  }
2226
			# Plot reads
2227
			reads = sample$inputs
2228
			signs = sign_from_strand(reads[,3])
2229
			if (plot_arrow_raw_reads) {
2230
				arrows(sign(x_min) * reads[,2] + shift, sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads + y_lev, sign(x_min) * (reads[,2] + signs * read_length) + shift, sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads + y_lev,
2231
				col=1, length=0.15/nb_rank)
2232
			}
2233
	    if (plot_squared_reads) {
2234
        # require(plotrix)
2235
				rect(sign(x_min) * reads[,2] + shift, rep(y_lev,length(reads[,1])), sign(x_min) * (reads[,2] + signs * read_length) + shift,  y_lev + sign(x_min) * signs * reads[,4] * 1000000/sample$total_reads, col=adjustcolor(1, alpha.f = 0.1),border=0)
2236
			}
2237
	    if (plot_coverage) {
2238
        if (length(reads[,1]) != 0) {
2239
          step_h = sign(x_min) * signs * reads[,4]
2240
          step_b = sign(x_min) * reads[,2] + shift
2241
          step_e = sign(x_min) * (reads[,2] + signs * 150) + shift
2242
          steps_x = min(step_b, step_e):max(step_b, step_e)
2243
          steps_y = rep(0, length(steps_x))
2244
          for (i in 1:length(step_h)) {
2245
            steps_y[which(steps_x==min(step_b[i], step_e[i]))] =  steps_y[which(steps_x==min(step_b[i], step_e[i]))] + abs(step_h[i])
2246
            steps_y[which(steps_x==max(step_b[i], step_e[i]))] =  steps_y[which(steps_x==max(step_b[i], step_e[i]))] - abs(step_h[i])
2247
          }
2248
          tmp_index = which(steps_y != 0)
2249
          steps_x = steps_x[tmp_index]
2250
          steps_y = steps_y[tmp_index]
2251
          tmp_current_level = 0
2252
          for (i in 1:length(steps_y)) {
2253
            steps_y[i] = tmp_current_level + steps_y[i]
2254
            tmp_current_level = steps_y[i]
2255
          }
2256
          steps_y = c(0, steps_y)
2257
          steps_y = steps_y * 1000000/sample$total_reads
2258
        } else {
2259
          steps_y = c(0, 0, 0)
2260
          steps_x = c(x_min, x_max)
2261
        }
2262
        # print(steps_x)
2263
        # print(steps_y)
2264
        lines(stepfun(steps_x, steps_y + y_lev), pch="")
2265
        abline(y_lev,0)
2266
        returned_list[[paste("cov", sample$id, sep="_")]] = stepfun(steps_x, steps_y)
2267
			}
2268
			# Plot nucs
2269
	    if (sample$marker == "Mnase_Seq" & (plot_squared_reads | plot_gaussian_reads | plot_gaussian_unified_reads | plot_arrow_nuc_reads)) {
2270
				nucs = sample$outputs
2271
				if (length(nucs$center) > 0) {
2272
					col = 1
2273
		      for (i in 1:length(nucs$center)) {
2274
            if (change_col) {
2275
  						col = col + 1
2276
              } else {
2277
                col = "blue"
2278
              }
2279
		        nuc = nucs[i,]
2280
						involved_reads = filter_tf_inputs(reads, sample$roi$chr, nuc$lower_bound, nuc$upper_bound, nuc_width = nuc$width)
2281
				  	involved_signs = apply(t(involved_reads[,3]), 2, function(strand) {	if (strand == "F") return(1) else return(-1)})
2282
						total_involved_reads = sum(involved_reads[,4])
2283
						if (plot_arrow_nuc_reads ) {
2284
							arrows(sign(x_min) * involved_reads[,2] + shift, sign(x_min) * involved_signs * involved_reads[,4] * 1000000/sample$total_reads + y_lev, sign(x_min) * (involved_reads[,2] + involved_signs * read_length) + shift, sign(x_min) * involved_signs * involved_reads[,4] * 1000000/sample$total_reads + y_lev,
2285
							col=col, length=0.15/nb_rank)
2286
						}
2287
	          if (plot_gaussian_reads | plot_gaussian_unified_reads) {
2288
  						flatted_reads = flat_reads(involved_reads, nuc$width)
2289
	  					delta_x = (nuc$center - nuc$width):(nuc$center + nuc$width)
2290
		  			}
2291
	          if (plot_gaussian_reads ) {
2292
							flatted_reads = flat_reads(involved_reads, nuc$width)
2293
							delta_x = (nuc$center - nuc$width):(nuc$center + nuc$width)
2294
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[1]]), sd(flatted_reads[[1]])) * length(flatted_reads[[1]]) * sign(x_min) * height/5 + y_lev, col=col)
2295
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[2]]), sd(flatted_reads[[2]])) * length(flatted_reads[[2]]) * -1 * sign(x_min) * height/5 + y_lev, col=col)
2296
	          }
2297
	          if (plot_gaussian_unified_reads ) {
2298
							lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(flatted_reads[[3]]), sd(flatted_reads[[3]])) * length(flatted_reads[[3]]) * height/5 + y_lev, col=col, lty=1)
2299
	          }
2300
	          if (plot_ellipse_nucs) {
2301
				      # require(plotrix)
2302
	  	 				draw.ellipse(sign(x_min) * nuc$center + shift, y_lev, nuc$width/2, total_involved_reads/nuc$width * height/5, border=col)
2303
						}
2304
		      }
2305
		    } else {
2306
		      print("WARNING! No nucs to print.")
2307
		    }
2308
			}
2309
	  }
2310

    
2311
	  # Plot wp nucs
2312
		if ((plot_wp_nucs_4_nonmnase | sample$marker == "Mnase_Seq") & (plot_wp_nucs | plot_fuzzy_nucs | plot_common_nucs | plot_chain)) {
2313
			if (samples[[1]]$marker == "Mnase_Seq") {
2314
				if (is.null(aggregated_intra_strain_nucs)) {
2315
	  			wp_nucs = aggregate_intra_strain_nucs(samples)[[1]]
2316
				} else {
2317
					wp_nucs = aggregated_intra_strain_nucs[[replicate_rank]]
2318
				}
2319
		  } else {
2320
  			wp_nucs = replicates_wp_nucs[[replicate_rank-2]]
2321
		  }
2322
      if (plot_chain) {
2323
        tf_nucs = lapply(wp_nucs, function(nuc) {
2324
          bar = apply(t(nuc$nucs), 2, function(tmp_nuc){
2325
            tmp_nuc = tmp_nuc[[1]]
2326
            tmp_nuc$inputs = NULL
2327
            tmp_nuc$original_reads = NULL
2328
            tmp_nuc$wp = nuc$wp
2329
            # print(tmp_nuc)
2330
            return(tmp_nuc)
2331
          })
2332
          return(do.call(rbind, bar))
2333
        })
2334
        tf_nucs = data.frame(do.call(rbind, tf_nucs))
2335
        tmp_x = (unlist(tf_nucs$lower_bound) + unlist(tf_nucs$upper_bound)) / 2
2336
        tmp_y =  y_min + (unlist(tf_nucs$track) - 0.5) * delta_y/nb_rank
2337
        tmp_y_prev = tmp_y[-length(tmp_y)]
2338
        tmp_y_next = tmp_y[-1]
2339
        tmp_y_inter = (tmp_y_prev + tmp_y_next) / 2
2340
        tmp_track = unlist(tf_nucs$track)
2341
        tmp_track_prev = tmp_track[-length(tmp_track)]
2342
        tmp_track_next = tmp_track[-1]
2343
        # tmp_track_inter = signif(tmp_track_prev - tmp_track_next) * (abs(tmp_track_prev - tmp_track_next) > 1) * 25
2344
        if (is.null(config$TRACK_LLR_OFFSET)) {
2345
          config$TRACK_LLR_OFFSET = 0
2346
        }
2347
        tmp_track_inter = signif(tmp_track_prev - tmp_track_next) + config$TRACK_LLR_OFFSET * 25
2348
        tmp_x_prev = tmp_x[-length(tmp_x)]
2349
        tmp_x_next = tmp_x[-1]
2350
        need_shift = apply(t(tmp_x_next - tmp_x_prev), 2, function(delta){ delta < 50})
2351
        tmp_x_inter = (tmp_x_prev + tmp_x_next) / 2 + tmp_track_inter * need_shift
2352
        tmp_llr_inter =signif(unlist(tf_nucs$llr_score)[-1], 2)
2353
        new_tmp_x = c()
2354
        new_tmp_y = c()
2355
        index_odd = 1:length(tmp_x) * 2 - 1
2356
        index_even = (1:(length(tmp_x) - 1)) * 2
2357
        new_tmp_x[index_odd] = tmp_x
2358
        new_tmp_y[index_odd] = tmp_y
2359
        new_tmp_x[index_even] = tmp_x_inter
2360
        new_tmp_y[index_even] = tmp_y_inter
2361
        lines(new_tmp_x , new_tmp_y, lwd=2)
2362
        points(tmp_x, tmp_y, cex=4, pch=16, col="white")
2363
        points(tmp_x, tmp_y, cex=4, lwd=2)
2364
        text(tmp_x, tmp_y, 1:nrow(tf_nucs))
2365
        if (is.null(config$LEGEND_LLR_POS)) {
2366
          pos = 2
2367
        } else {
2368
          pos = config$LEGEND_LLR_POS
2369
        }
2370
        col_llr = sapply(tmp_llr_inter, function(llr){if (llr < 20 ) return("green") else return("red")})
2371
        text(tmp_x_inter, tmp_y_inter, tmp_llr_inter, cex=1.5, pos=pos, col=col_llr) 
2372
        
2373
      }
2374

    
2375
      if (plot_wp_nucs | plot_fuzzy_nucs | plot_common_nucs ) {
2376
    		replicates_wp_nucs[[replicate_rank]] = wp_nucs
2377
        strain = samples[[1]]$strain
2378
        nb_tracks = length(replicates[[1]])
2379
        # print(paste("**************"), nb_tracks)
2380
        wp_maps[[strain]] = flat_aggregated_intra_strain_nucs(wp_nucs, "foo", nb_tracks)
2381
        fuzzy_maps[[strain]] = get_intra_strain_fuzzy(wp_maps[[strain]], as.list(samples[[1]]$roi), samples[[1]]$strain, config=config)
2382

    
2383
        if (plot_fuzzy_nucs) {
2384
          fuzzy_map = fuzzy_maps[[strain]]
2385
          if (!is.null(fuzzy_map)) {
2386
            if (nrow(fuzzy_map) > 0) {
2387
              rect(sign(x_min) * fuzzy_map$lower_bound + shift, y_min, sign(x_min) * fuzzy_map$upper_bound + shift, y_max, col=adjustcolor(3, alpha.f = 0.1), border=1)                    
2388
            }
2389
          }
2390
        } 
2391

    
2392
        if (plot_wp_nucs) {
2393
    			for (wp_nuc in wp_nucs) {
2394
    				if (wp_nuc$wp){
2395
    					rect(sign(x_min) * wp_nuc$lower_bound + shift, y_min, sign(x_min) * wp_nuc$upper_bound + shift, y_max, col=adjustcolor(2, alpha.f = 0.1), border=1)
2396
    					if (plot_wp_nuc_model) {
2397
      					all_original_reads = c()
2398
      					for(initial_nuc in wp_nuc$nucs) {
2399
      						all_original_reads = c(all_original_reads, initial_nuc$original_reads)
2400
      					}
2401
      					delta_x = wp_nuc$lower_bound:wp_nuc$upper_bound
2402
    					  lines(sign(x_min) * delta_x + shift, dnorm(delta_x, mean(all_original_reads), sd(all_original_reads)) * length(all_original_reads) * height/5 + y_min, col=1)
2403
    				  }
2404
  				  }
2405
  				}
2406
  			}
2407
      }
2408
		}
2409
	}
2410

    
2411
	if (plot_common_nucs) {
2412
    if (is.null(aligned_inter_strain_nucs)) {
2413
      aligned_inter_strain_nucs = align_inter_strain_nucs(replicates, replicates_wp_nucs[[1]], replicates_wp_nucs[[2]], config=config)[[1]]
2414
      if (!is.null(aligned_inter_strain_nucs)) {
2415
        aligned_inter_strain_nucs$cur_index = "foo" 
2416
      }
2417
    }
2418

    
2419
    #Plot common wp nucs
2420
    mid_y = shift = x_min = x_max = nb_rank = base = ylim = ymin = y_max = delta_y = list()
2421
    for (replicate_rank in 1:length(replicates)) {
2422
      nb_rank[[replicate_rank]] = length(samples)
2423
      base[[replicate_rank]] = (replicate_rank-1) * height * nb_rank[[replicate_rank]]
2424
      ylim[[replicate_rank]] = c(base[[replicate_rank]], base[[replicate_rank]] + height * nb_rank[[replicate_rank]])
2425
      y_min[[replicate_rank]] = min(ylim[[replicate_rank]])
2426
      y_max[[replicate_rank]] = max(ylim[[replicate_rank]])
2427
      delta_y[[replicate_rank]] = y_max[[replicate_rank]] - y_min[[replicate_rank]]
2428
      mid_y[[replicate_rank]] = (y_max[[replicate_rank]] + y_min[[replicate_rank]]) / 2
2429

    
2430
      samples = replicates[[replicate_rank]]
2431
      for (sample_rank in 1:length(samples)) {
2432
        sample = samples[[sample_rank]]
2433
        y_lev = y_min[[replicate_rank]] + (sample_rank - 0.5) * delta_y[[replicate_rank]]/nb_rank[[replicate_rank]]
2434
        if (sample$roi[["begin"]] < sample$roi[["end"]]) {
2435
          x_min[[replicate_rank]] = sample$roi[["begin"]]
2436
          x_max[[replicate_rank]] = sample$roi[["end"]]
2437
        } else {
2438
          x_min[[replicate_rank]] = - sample$roi[["begin"]]
2439
          x_max[[replicate_rank]] = - sample$roi[["end"]]
2440
        }
2441
        shift[[replicate_rank]] = x_min[[1]] - x_min[[replicate_rank]]
2442
      }
2443
    }
2444
    print(aligned_inter_strain_nucs)
2445
    if (!is.null(aligned_inter_strain_nucs)) {
2446
      for (inter_strain_nuc_index in 1:length(aligned_inter_strain_nucs[,1])) {
2447
        inter_strain_nuc = aligned_inter_strain_nucs[inter_strain_nuc_index,]
2448
        tmp_xs = tmp_ys = c()
2449
        for (replicate_rank in 1:length(replicates)) {
2450
          samples = replicates[[replicate_rank]]
2451
          strain = samples[[1]]$strain
2452
          tmp_xs = c(tmp_xs, sign(x_min[[replicate_rank]]) * (inter_strain_nuc[[paste("lower_bound_",strain,sep="")]] + inter_strain_nuc[[paste("upper_bound_",strain,sep="")]])/2 + shift[[replicate_rank]])
2453
          tmp_ys = c(tmp_ys, mid_y[[replicate_rank]])
2454
        }
2455
        lines(tmp_xs, tmp_ys, col=2, type="b", lwd=dev.size()[1]*100/(x_max[[1]]-x_min[[1]])*8, cex=dev.size()[1]*400/(x_max[[1]]-x_min[[1]]), pch=19)
2456
      }
2457
    }
2458
    
2459
    if (plot_common_unrs) {
2460
      combi = c(replicates[[1]][[1]]$strain, replicates[[2]][[1]]$strain)
2461
      roi = as.list(samples[[1]]$roi)
2462
      cur_index = "foo"
2463
      common_nuc_results = list()
2464
      common_nuc_results[[paste(combi[1], combi[2], sep="_")]] = aligned_inter_strain_nucs
2465
      unrs = get_unrs(combi, roi, cur_index, wp_maps, fuzzy_maps, common_nuc_results, config = config) 
2466
      rect(sign(x_min[[1]]) * unrs$lower_bound + shift[[1]], y_min[[1]], sign(x_min[[1]]) * unrs$upper_bound + shift[[1]], y_max[[2]], border=4, lwd=10, col=adjustcolor(4, alpha.f = 0.05))        
2467
    }
2468

    
2469
	}
2470
  return(returned_list)
2471
}
2472

    
2473

    
2474

    
2475
get_intra_strain_fuzzy = function(# Compute the fuzzy list for a given strain.
2476
### This function grabs the nucleosomes detxted by template_filter that have been rejected bt aggregate_intra_strain_nucs as well positions.
2477
wp_map, ##<< Well positionned nucleosomes map.
2478
roi, ##<< The region of interest.
2479
strain, ##<< The strain we want to extracvt the fuzzy map.
2480
config=NULL ##<< GLOBAL config variable.
2481
) {
2482
  fuzzy_map = wp_map[wp_map$wp==0, ]
2483
  if (nrow(fuzzy_map) > 0) {
2484
    fuzzy_map = substract_region(fuzzy_map, wp_map[wp_map$wp==1,])
2485
    if (!is.null(fuzzy_map)) {
2486
      fuzzy_map = union_regions(fuzzy_map)
2487
      fuzzy_map = crop_fuzzy(fuzzy_map, roi, strain, config)
2488
    }
2489
  }
2490
  return(fuzzy_map)
2491
}
2492

    
2493

    
2494

    
2495

    
2496

    
2497
get_unrs = function(# Compute the unaligned nucleosomal regions (UNRs).
2498
### This function aggregate non common wp nucs for each strain and substract common wp nucs. It does not take care about the size of the resulting UNR. It will be take into account in the count read part og the pipeline.
2499
combi, ##<< The strain combination to consider.
2500
roi, ##<< The region of interest.
2501
cur_index, ##<< The region of interest index.
2502
wp_maps, ##<< Well positionned nucleosomes maps.
2503
fuzzy_maps, ##<< Fuzzy nucleosomes maps.
2504
common_nuc_results, ##<< Common wp nuc maps
2505
config=NULL ##<< GLOBAL config variable
2506
) {
2507
  # print(cur_index)
2508

    
2509
  tmp_combi_key = paste(combi[1], combi[2], sep="_")
2510
  tmp_common_nucs = common_nuc_results[[tmp_combi_key]]
2511
  tmp_common_nucs = tmp_common_nucs[tmp_common_nucs$cur_index == cur_index, ]
2512

    
2513
  # print(paste("Dealing with unr from", combi[1]))
2514
  tmp_fuzzy = fuzzy_maps[[combi[1]]]
2515
  tmp_fuzzy = tmp_fuzzy[tmp_fuzzy$cur_index == cur_index, ]
2516
  tmp_wp = wp_maps[[combi[1]]]
2517
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2518
  tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ]
2519
  # Let's go!
2520
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2521
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[1], sep="_")]]) {
2522
      return(NULL)
2523
    } else {
2524
      return (index_nuc)
2525
    }
2526
  }))
2527
  tmp_unaligned_wp = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2528
  tmp_unr = rbind(tmp_fuzzy,tmp_unaligned_wp[,1:4])
2529
  if (length(tmp_unr) != 0) {
2530
    tmp_unr = union_regions(tmp_unr)
2531
  }
2532
  tmp_unr_nucs_1 = tmp_unr
2533
  if (length(tmp_unr_nucs_1[,1]) == 0) {return(NULL)}
2534
  agg_unr_1 = tmp_unr_nucs_1
2535

    
2536
  # print(paste("Dealing with unr from ", combi[2]))
2537
  tmp_fuzzy = fuzzy_maps[[combi[2]]]
2538
  tmp_fuzzy = tmp_fuzzy[tmp_fuzzy$cur_index == cur_index, ]
2539
  tmp_wp = wp_maps[[combi[2]]]
2540
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2541
  tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ]
2542
  # Let's go!
2543
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2544
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[2], sep="_")]]) {
2545
      return(NULL)
2546
    } else {
2547
      return (index_nuc)
2548
    }
2549
  }))
2550
  tmp_unaligned_wp = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2551
  tmp_unr = rbind(tmp_fuzzy,tmp_unaligned_wp[,1:4])
2552
  if (length(tmp_unr) != 0) {
2553
    tmp_unr = union_regions(tmp_unr)
2554
  }
2555
  tmp_unr_nucs_2 = tmp_unr
2556
  if (length(tmp_unr_nucs_2[,1]) == 0) {return(NULL)}
2557
  agg_unr_2 = crop_fuzzy(tmp_unr_nucs_2, roi, combi[2], config)
2558
  tr_agg_unr_2 = translate_regions(agg_unr_2, combi, cur_index, roi=roi, config=config)
2559
  tr_agg_unr_2 = union_regions(tr_agg_unr_2)
2560

    
2561
  # print("Dealing with unr from both...")
2562
  all_unr = union_regions(rbind(agg_unr_1, tr_agg_unr_2))
2563

    
2564
  # print(paste("Dealing with wp from", combi[1]))
2565
  tmp_wp = wp_maps[[combi[1]]]
2566
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2567
  tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ]
2568
  # Let's go!
2569
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2570
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[1], sep="_")]]) {
2571
      return (index_nuc)
2572
    } else {
2573
      return(NULL)
2574
    }
2575
  }))
2576
  wp_nucs_1 = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2577
  
2578
  # print(paste("Dealing with wp from", combi[2]))
2579
  tmp_wp = wp_maps[[combi[2]]]
2580
  tmp_wp = tmp_wp[tmp_wp$wp==1,]
2581
  tmp_wp = tmp_wp[tmp_wp$cur_index == cur_index, ]
2582
  # Let's go!
2583
  tmp_index = unlist(apply(t(tmp_wp$index_nuc), 2, function(index_nuc) {
2584
    if (index_nuc %in% tmp_common_nucs[[paste("index_nuc", combi[2], sep="_")]]) {
2585
      return (index_nuc)
2586
    } else {
2587
      return(NULL)
2588
    }
2589
  }))
2590
  wp_nucs_2 = tmp_wp[tmp_wp$index_nuc %in% tmp_index, ]
2591
  wp_nucs_2 = crop_fuzzy(wp_nucs_2, roi, combi[2], config)
2592
  if (nrow(wp_nucs_2) == 0) {
2593
    tr_wp_nucs_2 = wp_nucs_2
2594
  } else {
2595
    tr_wp_nucs_2 = translate_regions(wp_nucs_2, combi, cur_index, roi=roi, config=config)      
2596
  }
2597

    
2598
  # print("Dealing with wp from both...")
2599
  all_wp = union_regions(rbind(wp_nucs_1[,1:4], tr_wp_nucs_2))
2600

    
2601
  # print("Dealing with unr and wp...")
2602
  non_inter_unr = substract_region(all_unr, all_wp)
2603
  non_inter_unr = crop_fuzzy(non_inter_unr, roi, combi[1], config)
2604
  if (is.null(non_inter_unr)) { return(NULL) }
2605
  non_inter_unr$len = non_inter_unr$upper_bound - non_inter_unr$lower_bound
2606
  min_unr_width = 75
2607
  non_inter_unr = non_inter_unr[non_inter_unr$len >= min_unr_width,]
2608

    
2609
  non_inter_unr$index_nuc = 1:length(non_inter_unr[,1])
2610
  return (non_inter_unr)
2611
}
2612

    
2613

    
2614