Revision 780f632a: src/R/nucleominer.R

--- a/src/R/nucleominer.R
+++ b/src/R/nucleominer.R
@@ -2,7 +2,9 @@
 marker,
 combi,
 form,
-cur_index
+cur_index,
+all_samples, ##<< A table that describes all our samples.
+config=NULL ##<< GLOBAL config variable.
 ) {
 	print(paste("Counting reads for ", form, " CUR ", cur_index, " in ", marker, " for [", combi[1], ",", combi[2], "].", sep=""))
 	nucs = list()
@@ -39,10 +41,14 @@
     	manip = marker
     	{
     		for (strain in combi) {
-    			for(sample in all_samples[all_samples$marker == manip & all_samples$strain == strain, ]$id) {
+    			for(sample_id in all_samples[all_samples$marker == manip & all_samples$strain == strain, ]$id) {
             # tmp_filename = paste(config$ALIGN_DIR, "/TF/splited/sample_", sample, "_chr_", res[[paste("chr", strain, sep="_")]], "_splited_sample.tab.gz",sep="")
-    				tmp_filename = paste(config$ALIGN_DIR, "/TF/sample_", sample, "_TF.txt", sep="")
-    				tmp_unnorm_reads = mread.table(tmp_filename, stringsAsFactors=FALSE)
+            if ("tf_input" %in% names(all_samples)) {
+              sample_inputs_filename = all_samples$tf_input[all_samples$id==sample_id]
+            } else {
+              sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", sample_id, "_TF.txt", sep="")
+            }
+    				tmp_unnorm_reads = mread.table(sample_inputs_filename, stringsAsFactors=FALSE)
             names(tmp_unnorm_reads) = c("chr", "pos", "strand", "nb_reads")
     				orig_cur = list(name = "foo", 
                                 begin = res[[paste("lower_bound", strain, sep="_")]], 
@@ -51,7 +57,7 @@
     														strain_ref = strain)
     				orig_cur$length = orig_cur$end - orig_cur$begin + 1
     				tmp_nuc_reads = filter_tf_inputs(tmp_unnorm_reads, orig_cur$chr, orig_cur$begin, orig_cur$end, orig_cur$length)
-    				res[[paste(strain, manip, sample, sep="_")]] = sum(tmp_nuc_reads[,4])
+    				res[[paste(strain, manip, sample_id, sep="_")]] = sum(tmp_nuc_reads[,4])
     			}
     		}
     	}
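
The two hunks above switch count_reads_cur from a hard-coded ALIGN_DIR naming convention to an optional per-sample override: when the all_samples table has a tf_input column, its value is used, otherwise the old sample_<id>_TF.txt path is built. A minimal standalone sketch of that lookup (resolve_tf_input is a hypothetical name, not a function of nucleominer.R); as in the revision, only the presence of the column is tested, not individual values:

# Hypothetical helper, for illustration only.
resolve_tf_input = function(all_samples, sample_id, align_dir) {
  if ("tf_input" %in% names(all_samples)) {
    all_samples$tf_input[all_samples$id == sample_id]
  } else {
    paste(align_dir, "/TF/sample_", sample_id, "_TF.txt", sep="")
  }
}

# With the column present the override is used; without it the default applies.
samples_with_override = data.frame(id = 1, tf_input = "custom_1_TF.txt", stringsAsFactors = FALSE)
samples_without       = data.frame(id = 1, stringsAsFactors = FALSE)
resolve_tf_input(samples_with_override, 1, "align")  # "custom_1_TF.txt"
resolve_tf_input(samples_without, 1, "align")        # "align/TF/sample_1_TF.txt"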
@@ -69,10 +75,12 @@
 combi, ##<< The combinations of strains for which we want to build the count table.
 form, ##<< The nucleosome form that we want to observe: "wp" for well-positioned and "unr" for UNR.
 curs, ##<< The list of CURs
+all_samples, ##<< A table that describes all our samples.
 config=NULL ##<< GLOBAL config variable.
 ) {
+  dir.create(config$RESULTS_DIR, recursive=TRUE, showWarnings=FALSE)
 	all_res = apply(t(1:nrow(curs)), 2, function(cur_index) {
-    res = count_reads_cur(marker=marker, combi=combi, form=form, cur_index=cur_index)
+    res = count_reads_cur(marker=marker, combi=combi, form=form, cur_index=cur_index, all_samples, config)
 		return(res)
 	})
 	vec_names = names(all_res[[1]][[1]])
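
The CUR table is walked with apply(t(1:nrow(curs)), 2, ...): transposing the index vector yields a one-row matrix, so apply over its columns calls the function once per CUR index. A tiny illustration of the idiom (toy values, not from the package):

t(1:3)                                # a 1 x 3 matrix holding the indices
apply(t(1:3), 2, function(i) i * 10)  # visits 1, 2, 3 and returns c(10, 20, 30)
sapply(1:3, function(i) i * 10)       # same result with a more common idiom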
@@ -130,6 +138,7 @@
 curs, ##<< The list of CURs
 config=NULL##<< GLOBAL config variable.
 ) {
+  dir.create(config$RESULTS_DIR, recursive=TRUE, showWarnings=FALSE)
   ### build wp maps
 	glo_results = apply(t(1:1:nrow(curs)), 2, function(cur_index) {
   	dyad_shift = wp_llr = strain_maps = common_nuc_results = intra_llrs = inter_llrs = list()
@@ -1021,14 +1030,24 @@
       sample$roi$genome = mread.fasta(config$FASTA_REFERENCE_GENOME_FILES[[sample$strain]])[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end]
 		}
 		# Get inputs
-		sample$inputs = mread.table(paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep=""), stringsAsFactors=FALSE)
+    if ("tf_input" %in% names(all_samples)) {
+      sample_inputs_filename = all_samples$tf_input[all_samples$id==i]
+    } else {
+      sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep="")
+    }
+		sample$inputs = mread.table(sample_inputs_filename, stringsAsFactors=FALSE)
 		sample$total_reads = sum(sample$inputs[,4])
 		if (!only_fetch) {
 		  sample$inputs = filter_tf_inputs(sample$inputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
 	  }
 	  # Get TF outputs for Mnase_Seq samples
 		if (sample$marker == "Mnase_Seq" & get_ouputs) {
-			sample$outputs = mread.table(paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep=""), header=TRUE, sep="\t")
+      if ("tf_output" %in% names(all_samples)) {
+        sample_outputs_filename = all_samples$tf_output[all_samples$id==i]
+      } else {
+        sample_outputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep="")
+      }
+      sample$outputs = mread.table(sample_outputs_filename, header=TRUE, sep="\t")
 			if (!only_fetch) {
 	  		sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr,  min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
   		}
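
All of these lookups assume a common shape for all_samples: at least id, strain and marker columns (already used to select samples), plus the optional tf_input and tf_output override columns introduced in this revision. A purely illustrative example of such a table (file names and values are invented):

all_samples = data.frame(
  id        = c(1, 2, 3, 4),
  strain    = c("BY", "BY", "YJM", "YJM"),
  marker    = c("Mnase_Seq", "H3K4me1", "Mnase_Seq", "H3K4me1"),
  tf_input  = c("in/sample_1_TF.txt", "in/sample_2_TF.txt",
                "in/sample_3_TF.txt", "in/sample_4_TF.txt"),
  # tf_output is only read for Mnase_Seq samples, so NA elsewhere is harmless.
  tf_output = c("out/sample_1_all_nucs.tab", NA,
                "out/sample_3_all_nucs.tab", NA),
  stringsAsFactors = FALSE
)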
@@ -1323,31 +1342,10 @@
   legend("topright", col=(1:length(sample_ids))+1, lty=1:length(sample_ids), legend=cols)
 }
 
-analyse_design = function(# Launch DESeq methods.
-### This function is based on DESeq example. It normalizes data, fit data to GLM model with and without interaction term and compares the two models.
-snep_design, ##<< The design to consider.
-reads ##<< The data to consider.
-) {
-	snep_count_table = reads[, rownames(snep_design)]
-	cdsFull = newCountDataSet(snep_count_table, snep_design)
-	cdsFull1 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
-	fit1 = fitNbinomGLMs(cdsFull1, count ~ manip * strain)
-
-	cdsFull0 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
-	fit0 = fitNbinomGLMs(cdsFull0, count ~ manip + strain)
-
-	pvalsGLM = nbinomGLMTest( fit1, fit0 )
-	return(list(fit1, fit0, snep_design, pvalsGLM))
-}
 
 
 
-
-
-
-
-
-get_sneps = structure(function(# Compute the list of SNEPs for a given set of marker, strain combination and nuc form.
+analyse_count_table = structure(function(# Compute the list of SNEPs for a given set of marker, strain combination and nuc form.
 ### This function uses
 marker, ##<< The marker involved.
 combi, ##<< The strain combination involved.
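
The removed analyse_design documented the statistical core: DESeq normalizes the counts, fits a negative-binomial GLM with and without the manip:strain interaction, and compares the two fits; the same calls are inlined into analyse_count_table in the next hunk, together with a plain two-strain nbinomTest on the MNase-only samples. A self-contained sketch of that comparison on simulated counts (toy data and object names; the DESeq calls themselves are the ones used by the revision):

library(DESeq)  # the original DESeq package, as used by nucleominer.R

set.seed(1)
# Toy count table: 200 nucleosomes x 8 samples (2 markers x 2 strains x 2 replicates).
count_tab = matrix(rnbinom(200 * 8, mu = 50, size = 10), nrow = 200,
                   dimnames = list(paste("nuc", 1:200, sep = "_"),
                                   paste("s", 1:8, sep = "")))
snep_design = data.frame(row.names = colnames(count_tab),
                         manip  = rep(c("Mnase_Seq", "H3K4me1"), each = 4),
                         strain = rep(c("BY", "YJM"), times = 4))

# Full model (with interaction) against the reduced additive model.
cdsFull = newCountDataSet(count_tab, snep_design)
cdsFull = estimateDispersions(estimateSizeFactors(cdsFull),
                              fitType = "local", method = "pooled", sharingMode = "maximum")
fit1 = fitNbinomGLMs(cdsFull, count ~ manip * strain)
fit0 = fitNbinomGLMs(cdsFull, count ~ manip + strain)
pvalsGLM = nbinomGLMTest(fit1, fit0)  # one p-value per nucleosome for the interaction term

# MNase-only contrast between the two strains, as in the new code path.
mnase_ids = rownames(snep_design)[snep_design$manip == "Mnase_Seq"]
mnase_cds = newCountDataSet(count_tab[, mnase_ids], factor(snep_design[mnase_ids, "strain"]))
mnase_cds = estimateDispersions(estimateSizeFactors(mnase_cds),
                                fitType = "local", method = "pooled", sharingMode = "maximum")
res_mnase = nbinomTest(mnase_cds, "BY", "YJM")
head(res_mnase[, c("id", "log2FoldChange", "pval")])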
@@ -1358,29 +1356,45 @@
 ) {
   # PRETREAT
   snep_design = get_design(marker, combi, all_samples)
-  reads = get_all_reads(marker, combi, form, config=config)
-  # RUN ANALYSE
-  tmp_analyse = analyse_design(snep_design, reads)
-  # RESULTS
-	fit1 = tmp_analyse[[1]]
-	fit0 = tmp_analyse[[2]]
+  mnase_design = snep_design[snep_design$manip == "Mnase_Seq", ]
+  snep_reads = get_all_reads(marker, combi, form, config=config)
+	snep_count_table = snep_reads[, rownames(snep_design)]
+  mnase_count_table = snep_reads[, rownames(mnase_design)]
+  # RUN ANALYSE FOR SNEPS
+	cdsFull = newCountDataSet(snep_count_table, snep_design)
+	cdsFull1 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
+	fit1 = fitNbinomGLMs(cdsFull1, count ~ manip * strain)
+	cdsFull0 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum")
+	fit0 = fitNbinomGLMs(cdsFull0, count ~ manip + strain)
+	pvalsGLM = nbinomGLMTest( fit1, fit0 )
+  # RUN ANALYSE FOR MNASE
+  mnase_cdsFull = newCountDataSet(mnase_count_table, mnase_design$strain)
+  mnase_cdsFull1 = estimateDispersions(estimateSizeFactors(mnase_cdsFull), fitType="local", method="pooled", sharingMode="maximum")
+  res_mnase = nbinomTest( mnase_cdsFull1, combi[1], combi[2])
+  # GLOBAL RESULTS
+  #   SNEPS
   k = names(fit1)
-  reads[[k[2]]] = signif(fit1[[k[2]]], 5)
-  reads[[k[3]]] = signif(fit1[[k[3]]], 5)
-  reads[[k[4]]] = signif(fit1[[k[4]]], 5)
-	reads$pvalsGLM = signif(tmp_analyse[[4]], 5)
-	snep_design = tmp_analyse[[3]]
+  snep_reads[[k[2]]] = signif(fit1[[k[2]]], 5)
+  snep_reads[[k[3]]] = signif(fit1[[k[3]]], 5)
+  snep_reads[[k[4]]] = signif(fit1[[k[4]]], 5)
+	snep_reads$pvalsGLM = signif(pvalsGLM, 5)
   # print(snep_design)
-	thres = FDR(reads$pvalsGLM, FDR)
-	reads$snep_index = reads$pvalsGLM < thres
-	print(paste(sum(reads$snep_index), " SNEPs found for ", length(reads[,1])," nucs and ", FDR*100,"% of FDR.", sep = ""))
-  return(reads)
+	thres = FDR(snep_reads$pvalsGLM, FDR)
+	snep_reads$snep_index = snep_reads$pvalsGLM < thres
+  print(paste(sum(snep_reads$snep_index), " SNEPs found for ", length(snep_reads[,1])," nucs and ", FDR*100,"% of FDR.", sep = ""))
+  #   MNASE
+  snep_reads[["mnase_l2fc"]] = signif(res_mnase$log2FoldChange, 5)
+  snep_reads[["mnase_l2fc_pval"]] = signif(res_mnase$pval, 5)
+  # write results
+	snep_filename = paste(config$RESULTS_DIR, "/" ,combi[1],"_",combi[2],"_",marker,"_", form, "_snep.tab",sep="")
+	write.table(snep_reads, file=snep_filename, row.names=FALSE, quote=FALSE)
+  return(snep_reads)
   },  ex=function(){
     marker = "H3K4me1"
     combi = c("BY", "YJM")
     form = "wpunr" # "wp" | "unr" | "wpunr"
-    # foo = get_sneps(marker, combi, form)
-    # foo = get_sneps("H4K12ac", c("BY", "RM"), "wp")
+    # foo = analyse_count_table(marker, combi, form)
+    # foo = analyse_count_table("H4K12ac", c("BY", "RM"), "wp")
 })
 
 
@@ -1680,14 +1694,14 @@
 })
 
 
-compute_inter_all_strain_curs = function (# Compute Common Uninterrupted Regions (CUR)
+compute_curs = function (# Compute Common Uninterrupted Regions (CUR)
 ### CURs are regions that can be aligned between the genomes
 diff_allowed = 30, ##<< the maximum indel width allowed in a CUR
 min_cur_width = 4000, ##<< The minimum width of a CUR
 combis = list(c("BY", "RM"), c("BY", "YJM"), c("RM", "YJM")), ##<< list of strain pairs that will be tested for uninterrupted regions
 config = NULL ##<< GLOBAL config variable
 ) {
-
+  dir.create(config$RESULTS_DIR, recursive=TRUE, showWarnings=FALSE)
   check_overlaping = function(strain1, strain2, chr, lower_bound, upper_bound, config=NULL) {
     c2c = c2c_extraction(strain1, strain2, chr, lower_bound, upper_bound, config=config)
     check_homogeneity(c2c)
@@ -1750,7 +1764,7 @@
       })
       return(do.call(rbind, rois))
     }
-    # foo_orig = compute_inter_all_strain_curs2(config=config)
+    # foo_orig = compute_curs2(config=config)
     # foo = foo_orig
     STOP = FALSE
     nb_round = 0
@@ -1903,7 +1917,7 @@
   }
 
   if (length(combis)==1) {
-    return(rois[[1]])
+    curs = rois[[1]]
   } else {
     reducted_1_rois = intersect_region(rois[["BY_RM"]], rois[["BY_YJM"]])
     reducted_1_rois = reducted_1_rois[reducted_1_rois$length >= min_cur_width, ]
@@ -1913,8 +1927,12 @@
     reducted_rois = translate_curs(reducted_2_rois, "BY", config)
     reducted_rois = reducted_rois[order(as.numeric(reducted_rois$chr), reducted_rois$begin), ]
     squeezed_rois = test_and_squeeze_rois(reducted_rois, config=config)
-    return(squeezed_rois)
+    curs = squeezed_rois
   }
+
+  cur_filename = paste(config$RESULTS_DIR, "/all_curs.tab", sep="")
+  write.table(curs, file=cur_filename, row.names=FALSE, quote=FALSE)
+  return(curs)
 }
 
 intersect_region = function(# Returns the intersection of 2 lists of regions.
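
compute_curs now persists its result before returning it. Because write.table is called with the default separator and col.names, the file can be read back with read.table(header=TRUE), provided no CUR field contains whitespace (an assumption about the table contents):

# Reload the CURs written by compute_curs; assumes config$RESULTS_DIR is set
# as elsewhere in the file and that no field contains spaces.
cur_filename = paste(config$RESULTS_DIR, "/all_curs.tab", sep="")
curs = read.table(cur_filename, header=TRUE, stringsAsFactors=FALSE)
head(curs)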
@@ -1974,6 +1992,11 @@
   			sample$roi$genome = mread.fasta(fasta_ref_filename, )[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end]
   		}
   		# Get inputs
+      if ("tf_input" %in% names(all_samples)) {
+        sample_inputs_filename = all_samples$tf_input[all_samples$id==i]
+      } else {
+        sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep="")
+      }
   		sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep="")
   		sample$inputs = mread.table(sample_inputs_filename, stringsAsFactors=FALSE)
   		sample$total_reads = sum(sample$inputs[,4])
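
Unlike the other call sites, the old unconditional assignment survives here as a context line immediately after the new conditional block, so sample_inputs_filename is overwritten and the tf_input override has no effect in this function. Assuming the override is meant to apply here too (it may be deliberate), the corrected sequence would mirror the earlier hunks:

# Sketch of the intended sequence: the unconditional paste() line is dropped.
if ("tf_input" %in% names(all_samples)) {
  sample_inputs_filename = all_samples$tf_input[all_samples$id==i]
} else {
  sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep="")
}
sample$inputs = mread.table(sample_inputs_filename, stringsAsFactors=FALSE)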
@@ -1982,7 +2005,11 @@
   	  }
   	  # Get TF outputs for Mnase_Seq samples
   		if (sample$marker == "Mnase_Seq" & get_ouputs) {
-  			sample_outputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep="")
+        if ("tf_output" %in% names(all_samples)) {
+          sample_outputs_filename = all_samples$tf_output[all_samples$id==i]
+        } else {
+          sample_outputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep="")
+        }
   			sample$outputs = mread.table(sample_outputs_filename, header=TRUE, sep="\t")
   			if (!only_fetch) {
   	  		sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr,  min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300)
