-
Notifications
You must be signed in to change notification settings - Fork 0
/
ccf_with_confidence_intervals.R
73 lines (58 loc) · 3.01 KB
/
ccf_with_confidence_intervals.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# Calculcate cancer cell fractions and 95% confidence intervals (CCFs) for tumor mutations with given:
# - purity (cellularity)
# - number of ref, and alt reads
# - major and minor copy number (larger and smaller allele specific integer copy numbers)
# - normal copy number (number of copies at given locus in normal human genome)
#
# code is based on this unpublished MSKCC github repo: https://github.com/mskcc/facets-suite/blob/master/R/ccf-annotate-maf.R
# concepts are based on the referenced PMIDs (28270531, 25877892)
#
# input data: tab-delimited table with fields mutation_id, sample_id, purity, t_ref_count, t_alt_count, major_cn, minor_cn, normal_cn
# output data: same as above with added fields: t_depth, total_copies, mutant_copies, ccf, ccf_lwr95, ccf_upr95
# Estimate mutant copy number, given observed VAF, purity, and local ploidy
# Based on PMID 28270531
expected_mutant_copies = function(t_var_freq, total_copies, purity) {
if (is.na(total_copies)) {
as.integer(NA)
} else {
if (total_copies == 0) total_copies = 1
mu = t_var_freq * (1 / purity) * (purity * total_copies + (1 - purity) * 2)
alt_copies = ifelse(mu < 1, 1, abs(mu)) # mu < 1 ~ 1, mu >= 1 ~ abs(mu)
alt_copies = ifelse(alt_copies > total_copies, total_copies, alt_copies) # don't allow expected number greater than the total
round(alt_copies)
}
}
# Estimate most likely CCF given observed VAF, purity and local ploidy
# Based on PMID 25877892
estimate_ccf = function(purity, total_copies, mutant_copies, t_alt_count, t_depth) {
expected_vaf = function(ccf, purity, total_copies) {
purity * ccf * mutant_copies / (2 * (1 - purity) + purity * total_copies)
}
ccfs = seq(0.000, 1, 0.001)
probs = sapply(ccfs, function(c) {
stats::dbinom(t_alt_count, t_depth, expected_vaf(c, purity, total_copies)) })
probs = probs / sum(probs)
ccf_mostlikely_index = which.max(probs)
ccf_value <- ccfs[ccf_mostlikely_index]
## empirical CDF to determine 95% CIs for the CCF
CDF <- cumsum(probs)
# closest CCF outside the lower bound of the 95% CI
below95 <- which(CDF <= 0.025)
ccf_lwr95_index <- max(below95)
ccf_lwr95 <- ccfs[ccf_lwr95_index]
# closest CCF outside the upper bound of the 95% CI
above95 <- which(CDF >= 0.975)
ccf_upr95_index <- min(above95)
ccf_upr95 <- ccfs[ccf_upr95_index]
list(ccf=ccf_value, ccf_lwr95=ccf_lwr95, ccf_upr95=ccf_upr95)
}
library(data.table)
d <- fread('<input_data.tsv>')
d[,t_var_freq:=t_alt_count / (t_alt_count + t_ref_count)]
d[,total_copies:=major_cn + minor_cn]
d[,t_depth:=t_alt_count + t_ref_count]
d[, mutant_copies := expected_mutant_copies(t_var_freq, total_copies, purity), by = seq_len(nrow(d))]
d[, c('ccf','ccf_lwr95','ccf_upr95') := estimate_ccf(purity, total_copies, mutant_copies, t_alt_count, t_depth), by = seq_len(nrow(d))]
d[ccf==0 & is.na(ccf_lwr95), ccf_lwr95:=0]
d[ccf==1 & is.na(ccf_upr95), ccf_upr95:=1]
write.table(d, '<output_data.tsv>', sep='\t', quote=F, row.names=F)