Révision 780f632a src/R/nucleominer.R
b/src/R/nucleominer.R | ||
---|---|---|
2 | 2 |
marker, |
3 | 3 |
combi, |
4 | 4 |
form, |
5 |
cur_index |
|
5 |
cur_index, |
|
6 |
all_samples, ##<< A table that describe all our samples. |
|
7 |
config=NULL ##<< GLOBAL config variable. |
|
6 | 8 |
) { |
7 | 9 |
print(paste("Counting reads for ", form, " CUR ", cur_index, " in ", marker, " for [", combi[1], ",", combi[2], "].", sep="")) |
8 | 10 |
nucs = list() |
... | ... | |
39 | 41 |
manip = marker |
40 | 42 |
{ |
41 | 43 |
for (strain in combi) { |
42 |
for(sample in all_samples[all_samples$marker == manip & all_samples$strain == strain, ]$id) { |
|
44 |
for(sample_id in all_samples[all_samples$marker == manip & all_samples$strain == strain, ]$id) {
|
|
43 | 45 |
# tmp_filename = paste(config$ALIGN_DIR, "/TF/splited/sample_", sample, "_chr_", res[[paste("chr", strain, sep="_")]], "_splited_sample.tab.gz",sep="") |
44 |
tmp_filename = paste(config$ALIGN_DIR, "/TF/sample_", sample, "_TF.txt", sep="") |
|
45 |
tmp_unnorm_reads = mread.table(tmp_filename, stringsAsFactors=FALSE) |
|
46 |
if ("tf_input" %in% names(all_samples)) { |
|
47 |
sample_inputs_filename = all_samples$tf_input[all_samples$id==sample_id] |
|
48 |
} else { |
|
49 |
sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", sample_id, "_TF.txt", sep="") |
|
50 |
} |
|
51 |
tmp_unnorm_reads = mread.table(sample_inputs_filename, stringsAsFactors=FALSE) |
|
46 | 52 |
names(tmp_unnorm_reads) = c("chr", "pos", "strand", "nb_reads") |
47 | 53 |
orig_cur = list(name = "foo", |
48 | 54 |
begin = res[[paste("lower_bound", strain, sep="_")]], |
... | ... | |
51 | 57 |
strain_ref = strain) |
52 | 58 |
orig_cur$length = orig_cur$end - orig_cur$begin + 1 |
53 | 59 |
tmp_nuc_reads = filter_tf_inputs(tmp_unnorm_reads, orig_cur$chr, orig_cur$begin, orig_cur$end, orig_cur$length) |
54 |
res[[paste(strain, manip, sample, sep="_")]] = sum(tmp_nuc_reads[,4]) |
|
60 |
res[[paste(strain, manip, sample_id, sep="_")]] = sum(tmp_nuc_reads[,4])
|
|
55 | 61 |
} |
56 | 62 |
} |
57 | 63 |
} |
... | ... | |
69 | 75 |
combi, ##<< The combinations of strains that we want to build the count table. |
70 | 76 |
form, ##<< The nucleosome that we want to observe: "wp" for sel;l position and "unr" for UNR. |
71 | 77 |
curs, ##<< The list of CURs |
78 |
all_samples, ##<< A table that describe all our samples. |
|
72 | 79 |
config=NULL ##<< GLOBAL config variable. |
73 | 80 |
) { |
81 |
dir.create(config$RESULTS_DIR, recursive=TRUE, showWarnings=FALSE) |
|
74 | 82 |
all_res = apply(t(1:nrow(curs)), 2, function(cur_index) { |
75 |
res = count_reads_cur(marker=marker, combi=combi, form=form, cur_index=cur_index) |
|
83 |
res = count_reads_cur(marker=marker, combi=combi, form=form, cur_index=cur_index, all_samples, config)
|
|
76 | 84 |
return(res) |
77 | 85 |
}) |
78 | 86 |
vec_names = names(all_res[[1]][[1]]) |
... | ... | |
130 | 138 |
curs, ##<< The list of CURs |
131 | 139 |
config=NULL##<< GLOBAL config variable. |
132 | 140 |
) { |
141 |
dir.create(config$RESULTS_DIR, recursive=TRUE, showWarnings=FALSE) |
|
133 | 142 |
### build wp maps |
134 | 143 |
glo_results = apply(t(1:1:nrow(curs)), 2, function(cur_index) { |
135 | 144 |
dyad_shift = wp_llr = strain_maps = common_nuc_results = intra_llrs = inter_llrs = list() |
... | ... | |
1021 | 1030 |
sample$roi$genome = mread.fasta(config$FASTA_REFERENCE_GENOME_FILES[[sample$strain]])[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end] |
1022 | 1031 |
} |
1023 | 1032 |
# Get inputs |
1024 |
sample$inputs = mread.table(paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep=""), stringsAsFactors=FALSE) |
|
1033 |
if ("tf_input" %in% names(all_samples)) { |
|
1034 |
sample_inputs_filename = all_samples$tf_input[all_samples$id==i] |
|
1035 |
} else { |
|
1036 |
sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep="") |
|
1037 |
} |
|
1038 |
sample$inputs = mread.table(sample_inputs_filename, stringsAsFactors=FALSE) |
|
1025 | 1039 |
sample$total_reads = sum(sample$inputs[,4]) |
1026 | 1040 |
if (!only_fetch) { |
1027 | 1041 |
sample$inputs = filter_tf_inputs(sample$inputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300) |
1028 | 1042 |
} |
1029 | 1043 |
# Get TF outputs for Mnase_Seq samples |
1030 | 1044 |
if (sample$marker == "Mnase_Seq" & get_ouputs) { |
1031 |
sample$outputs = mread.table(paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep=""), header=TRUE, sep="\t") |
|
1045 |
if ("tf_output" %in% names(all_samples)) { |
|
1046 |
sample_outputs_filename = all_samples$tf_output[all_samples$id==i] |
|
1047 |
} else { |
|
1048 |
sample_outputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep="") |
|
1049 |
} |
|
1050 |
sample$outputs = mread.table(sample_outputs_filename, header=TRUE, sep="\t") |
|
1032 | 1051 |
if (!only_fetch) { |
1033 | 1052 |
sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300) |
1034 | 1053 |
} |
... | ... | |
1323 | 1342 |
legend("topright", col=(1:length(sample_ids))+1, lty=1:length(sample_ids), legend=cols) |
1324 | 1343 |
} |
1325 | 1344 |
|
1326 |
analyse_design = function(# Launch DESeq methods. |
|
1327 |
### This function is based on DESeq example. It normalizes data, fit data to GLM model with and without interaction term and compares the two models. |
|
1328 |
snep_design, ##<< The design to consider. |
|
1329 |
reads ##<< The data to consider. |
|
1330 |
) { |
|
1331 |
snep_count_table = reads[, rownames(snep_design)] |
|
1332 |
cdsFull = newCountDataSet(snep_count_table, snep_design) |
|
1333 |
cdsFull1 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum") |
|
1334 |
fit1 = fitNbinomGLMs(cdsFull1, count ~ manip * strain) |
|
1335 |
|
|
1336 |
cdsFull0 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum") |
|
1337 |
fit0 = fitNbinomGLMs(cdsFull0, count ~ manip + strain) |
|
1338 |
|
|
1339 |
pvalsGLM = nbinomGLMTest( fit1, fit0 ) |
|
1340 |
return(list(fit1, fit0, snep_design, pvalsGLM)) |
|
1341 |
} |
|
1342 | 1345 |
|
1343 | 1346 |
|
1344 | 1347 |
|
1345 |
|
|
1346 |
|
|
1347 |
|
|
1348 |
|
|
1349 |
|
|
1350 |
get_sneps = structure(function(# Compute the list of SNEPs for a given set of marker, strain combination and nuc form. |
|
1348 |
analyse_count_table = structure(function(# Compute the list of SNEPs for a given set of marker, strain combination and nuc form. |
|
1351 | 1349 |
### This function uses |
1352 | 1350 |
marker, ##<< The marker involved. |
1353 | 1351 |
combi, ##<< The strain combination involved. |
... | ... | |
1358 | 1356 |
) { |
1359 | 1357 |
# PRETREAT |
1360 | 1358 |
snep_design = get_design(marker, combi, all_samples) |
1361 |
reads = get_all_reads(marker, combi, form, config=config) |
|
1362 |
# RUN ANALYSE |
|
1363 |
tmp_analyse = analyse_design(snep_design, reads) |
|
1364 |
# RESULTS |
|
1365 |
fit1 = tmp_analyse[[1]] |
|
1366 |
fit0 = tmp_analyse[[2]] |
|
1359 |
mnase_design = snep_design[snep_design$manip == "Mnase_Seq", ] |
|
1360 |
snep_reads = get_all_reads(marker, combi, form, config=config) |
|
1361 |
snep_count_table = snep_reads[, rownames(snep_design)] |
|
1362 |
mnase_count_table = snep_reads[, rownames(mnase_design)] |
|
1363 |
# RUN ANALYSE FOPR SNEPS |
|
1364 |
cdsFull = newCountDataSet(snep_count_table, snep_design) |
|
1365 |
cdsFull1 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum") |
|
1366 |
fit1 = fitNbinomGLMs(cdsFull1, count ~ manip * strain) |
|
1367 |
cdsFull0 = estimateDispersions(estimateSizeFactors(cdsFull), fitType="local", method="pooled", sharingMode="maximum") |
|
1368 |
fit0 = fitNbinomGLMs(cdsFull0, count ~ manip + strain) |
|
1369 |
pvalsGLM = nbinomGLMTest( fit1, fit0 ) |
|
1370 |
# RUN ANALYSE FOR MNASE |
|
1371 |
mnase_cdsFull = newCountDataSet(mnase_count_table, mnase_design$strain) |
|
1372 |
mnase_cdsFull1 = estimateDispersions(estimateSizeFactors(mnase_cdsFull), fitType="local", method="pooled", sharingMode="maximum") |
|
1373 |
res_mnase = nbinomTest( mnase_cdsFull1, combi[1], combi[2]) |
|
1374 |
# GLOBAL RESULTS |
|
1375 |
# SNEPS |
|
1367 | 1376 |
k = names(fit1) |
1368 |
reads[[k[2]]] = signif(fit1[[k[2]]], 5) |
|
1369 |
reads[[k[3]]] = signif(fit1[[k[3]]], 5) |
|
1370 |
reads[[k[4]]] = signif(fit1[[k[4]]], 5) |
|
1371 |
reads$pvalsGLM = signif(tmp_analyse[[4]], 5) |
|
1372 |
snep_design = tmp_analyse[[3]] |
|
1377 |
snep_reads[[k[2]]] = signif(fit1[[k[2]]], 5) |
|
1378 |
snep_reads[[k[3]]] = signif(fit1[[k[3]]], 5) |
|
1379 |
snep_reads[[k[4]]] = signif(fit1[[k[4]]], 5) |
|
1380 |
snep_reads$pvalsGLM = signif(pvalsGLM, 5) |
|
1373 | 1381 |
# print(snep_design) |
1374 |
thres = FDR(reads$pvalsGLM, FDR) |
|
1375 |
reads$snep_index = reads$pvalsGLM < thres |
|
1376 |
print(paste(sum(reads$snep_index), " SNEPs found for ", length(reads[,1])," nucs and ", FDR*100,"% of FDR.", sep = "")) |
|
1377 |
return(reads) |
|
1382 |
thres = FDR(snep_reads$pvalsGLM, FDR) |
|
1383 |
snep_reads$snep_index = snep_reads$pvalsGLM < thres |
|
1384 |
print(paste(sum(snep_reads$snep_index), " SNEPs found for ", length(snep_reads[,1])," nucs and ", FDR*100,"% of FDR.", sep = "")) |
|
1385 |
# MNASE |
|
1386 |
snep_reads[["mnase_l2fc"]] = signif(res_mnase$log2FoldChange, 5) |
|
1387 |
snep_reads[["mnase_l2fc_pval"]] = signif(res_mnase$pval, 5) |
|
1388 |
# write results |
|
1389 |
snep_filename = paste(config$RESULTS_DIR, "/" ,combi[1],"_",combi[2],"_",marker,"_", form, "_snep.tab",sep="") |
|
1390 |
write.table(snep_reads, file=snep_filename, row.names=FALSE, quote=FALSE) |
|
1391 |
return(snep_reads) |
|
1378 | 1392 |
}, ex=function(){ |
1379 | 1393 |
marker = "H3K4me1" |
1380 | 1394 |
combi = c("BY", "YJM") |
1381 | 1395 |
form = "wpunr" # "wp" | "unr" | "wpunr" |
1382 |
# foo = get_sneps(marker, combi, form)
|
|
1383 |
# foo = get_sneps("H4K12ac", c("BY", "RM"), "wp")
|
|
1396 |
# foo = analyse_count_table(marker, combi, form)
|
|
1397 |
# foo = analyse_count_table("H4K12ac", c("BY", "RM"), "wp")
|
|
1384 | 1398 |
}) |
1385 | 1399 |
|
1386 | 1400 |
|
... | ... | |
1680 | 1694 |
}) |
1681 | 1695 |
|
1682 | 1696 |
|
1683 |
compute_inter_all_strain_curs = function (# Compute Common Uninterrupted Regions (CUR)
|
|
1697 |
compute_curs = function (# Compute Common Uninterrupted Regions (CUR) |
|
1684 | 1698 |
### CURs are regions that can be aligned between the genomes |
1685 | 1699 |
diff_allowed = 30, ##<< the maximum indel width allowe din a CUR |
1686 | 1700 |
min_cur_width = 4000, ##<< The minimum width of a CUR |
1687 | 1701 |
combis = list(c("BY", "RM"), c("BY", "YJM"), c("RM", "YJM")), ##<< list of strain than will be tested as uninterrupted regions |
1688 | 1702 |
config = NULL ##<< GLOBAL config variable |
1689 | 1703 |
) { |
1690 |
|
|
1704 |
dir.create(config$RESULTS_DIR, recursive=TRUE, showWarnings=FALSE) |
|
1691 | 1705 |
check_overlaping = function(strain1, strain2, chr, lower_bound, upper_bound, config=NULL) { |
1692 | 1706 |
c2c = c2c_extraction(strain1, strain2, chr, lower_bound, upper_bound, config=config) |
1693 | 1707 |
check_homogeneity(c2c) |
... | ... | |
1750 | 1764 |
}) |
1751 | 1765 |
return(do.call(rbind, rois)) |
1752 | 1766 |
} |
1753 |
# foo_orig = compute_inter_all_strain_curs2(config=config)
|
|
1767 |
# foo_orig = compute_curs2(config=config) |
|
1754 | 1768 |
# foo = foo_orig |
1755 | 1769 |
STOP = FALSE |
1756 | 1770 |
nb_round = 0 |
... | ... | |
1903 | 1917 |
} |
1904 | 1918 |
|
1905 | 1919 |
if (length(combis)==1) { |
1906 |
return(rois[[1]])
|
|
1920 |
curs = rois[[1]]
|
|
1907 | 1921 |
} else { |
1908 | 1922 |
reducted_1_rois = intersect_region(rois[["BY_RM"]], rois[["BY_YJM"]]) |
1909 | 1923 |
reducted_1_rois = reducted_1_rois[reducted_1_rois$length >= min_cur_width, ] |
... | ... | |
1913 | 1927 |
reducted_rois = translate_curs(reducted_2_rois, "BY", config) |
1914 | 1928 |
reducted_rois = reducted_rois[order(as.numeric(reducted_rois$chr), reducted_rois$begin), ] |
1915 | 1929 |
squeezed_rois = test_and_squeeze_rois(reducted_rois, config=config) |
1916 |
return(squeezed_rois)
|
|
1930 |
curs = squeezed_rois
|
|
1917 | 1931 |
} |
1932 |
|
|
1933 |
cur_filename = paste(config$RESULTS_DIR, "/all_curs.tab", sep="") |
|
1934 |
write.table(curs, file=cur_filename, row.names=FALSE, quote=FALSE) |
|
1935 |
return(curs) |
|
1918 | 1936 |
} |
1919 | 1937 |
|
1920 | 1938 |
intersect_region = function(# Returns the intersection of 2 list on regions. |
... | ... | |
1974 | 1992 |
sample$roi$genome = mread.fasta(fasta_ref_filename, )[[switch_pairlist(config$FASTA_INDEXES[[sample$strain]])[[sample$roi$chr]]]][sample$roi$begin:sample$roi$end] |
1975 | 1993 |
} |
1976 | 1994 |
# Get inputs |
1995 |
if ("tf_input" %in% names(all_samples)) { |
|
1996 |
sample_inputs_filename = all_samples$tf_input[all_samples$id==i] |
|
1997 |
} else { |
|
1998 |
sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep="") |
|
1999 |
} |
|
1977 | 2000 |
sample_inputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_TF.txt", sep="") |
1978 | 2001 |
sample$inputs = mread.table(sample_inputs_filename, stringsAsFactors=FALSE) |
1979 | 2002 |
sample$total_reads = sum(sample$inputs[,4]) |
... | ... | |
1982 | 2005 |
} |
1983 | 2006 |
# Get TF outputs for Mnase_Seq samples |
1984 | 2007 |
if (sample$marker == "Mnase_Seq" & get_ouputs) { |
1985 |
sample_outputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep="") |
|
2008 |
if ("tf_output" %in% names(all_samples)) { |
|
2009 |
sample_outputs_filename = all_samples$tf_output[all_samples$id==i] |
|
2010 |
} else { |
|
2011 |
sample_outputs_filename = paste(config$ALIGN_DIR, "/TF/sample_", i, "_all_nucs.tab", sep="") |
|
2012 |
} |
|
1986 | 2013 |
sample$outputs = mread.table(sample_outputs_filename, header=TRUE, sep="\t") |
1987 | 2014 |
if (!only_fetch) { |
1988 | 2015 |
sample$outputs = filter_tf_outputs(sample$outputs, sample$roi$chr, min(sample$roi$begin, sample$roi$end), max(sample$roi$begin, sample$roi$end), 300) |
Formats disponibles : Unified diff