-
Notifications
You must be signed in to change notification settings - Fork 0
/
de.R
117 lines (93 loc) · 4.15 KB
/
de.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# read in gene names to remove
tcr_genes <- read_tsv('tcr_genes.tsv')$'Approved symbol'
bcr_genes <- read_tsv('bcr_genes.tsv')$'Approved symbol'
# Read in pseudobulk counts as a tibble and convert to data frame
read_counts <- function(counts_filepath){
counts <- read_csv(counts_filepath)
counts_rownames <- counts$featurekey
counts_colnames <- colnames(counts)[-1]
counts <- counts %>%
dplyr::select(-featurekey) %>%
round() %>%
data.frame()
rownames(counts) <- counts_rownames
colnames(counts) <- counts_colnames
return(counts)
}
# Read in metadata
read_meta <- function(meta_filepath){
meta <- read.csv(meta_filepath, row.names=1)
meta <- meta %>%
filter(condition == 'myocarditis' | condition == 'control') %>%
mutate(condition = factor(condition, levels=c('control', 'myocarditis')))
return(meta)
}
# Run DE analysis
run_de_by_comp_var <- function(counts, meta, save_name, comp_var_contrast_vec,
deseq_formula=NULL, cell_cutoff=5){
# filter for samples in metadata
counts <- dplyr::select(counts, rownames(meta))
# get comparison variable
comp_var <- comp_var_contrast_vec[1]
# create dataframe to hold all results for all clusters
all_res <- data.frame(baseMean=numeric(), log2FoldChange=numeric(), lfcSE=numeric(),
stat=numeric(), pvalue=numeric(), padj=numeric(),
gene_symbol=character(), cluster=numeric())
# run DE and make plot for each cluster
for (cluster in sort(unique(meta$cluster))) {
print(paste0('Cluster ', cluster))
# filter for cluster
meta_cluster <- meta[meta$cluster == cluster,]
counts_cluster <- counts[,rownames(meta)]
# skip clusters with too few cells
if (sum(meta_cluster$n_cells) <= 100) {next}
# filtering for sample/clusters that have more than cell_cutoff cells
meta_cluster <- meta_cluster[meta_cluster$n_cells >= cell_cutoff, ]
tot_cells <- sum(meta_cluster$n_cells)
# skip if design variable has only one level
if (length(unique(meta_cluster[,comp_var])) == 1) {next}
# filter the count matrix with sample/clusters that have more than cell_cutoff cells
counts_cluster <- counts_cluster[,rownames(meta_cluster)]
# for each gene, record how many samples have a non-zero count
n_samp <- rowSums(counts_cluster != 0)
# keep genes where least half of the samples have a non-zero count for that gene
counts_cluster <- counts_cluster[n_samp > (nrow(meta_cluster) / 2),]
# filter out tcr and bcr genes
counts_cluster <- counts_cluster[!(rownames(counts_cluster) %in% c(tcr_genes, bcr_genes)),]
# make sure count column names match metadata rownames for DESeq2
stopifnot(colnames(counts_cluster) == rownames(meta_cluster))
# get formula
if (is.null(deseq_formula)) {
deseq_formula <- formula(glue("~ {comp_var}"))
}
# run DESeq2
dds <- DESeqDataSetFromMatrix(countData = counts_cluster,
colData = meta_cluster,
design = deseq_formula)
dds <- DESeq(dds)
res <- as.data.frame(results(dds, contrast=comp_var_contrast_vec))
res <- res[!is.na(res$padj),]
# just for the volcano plot
res$gene_symbol <- rownames(res)
# concatenate cluster results to all_res
res$cluster <- cluster
all_res <- rbind(all_res, res)
}
print('saving results...')
write_csv(all_res, glue('{save_name}_de_by_{comp_var}_all_results.csv'))
return(all_res)
}
get_heatmap_data <- function(de_results, heatmap_genes, cluster_map){
# filter de results for select heatmap genes and add umap names
de_results %>%
mutate(cluster = as.character(cluster)) %>%
dplyr::select(gene_symbol, cluster, log2FoldChange, padj) %>%
inner_join(heatmap_genes, by = 'gene_symbol') %>%
left_join(cluster_map %>%
mutate(cluster_number = as.character(cluster_number)),
by = c('cluster' = 'cluster_number')) %>%
filter(!cluster_name %in% c('Doublets/RBCs', 'other')) %>%
mutate(cluster_name = factor(cluster_name)) %>%
complete(gene_symbol, cluster_name) %>%
arrange(gene_symbol)
}