-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy patheda.Rmd
216 lines (166 loc) · 6.92 KB
/
eda.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
---
title: "Exploration of how the totla UMIs vary across cell types"
author: "Rafael Irizarry"
date: "11/20/2019"
output: github_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Introduction
Most normalization procedures apply corrections that remove global effects. Here we explore the possibility that some of these global effects are biologically drive. In particular, we explore the total UMIs per cell which, in principle, should be related to cell size, for example.
## Read and wrangle data
For this code to work the rds and rda files must be in a directory called `rdas` in the home directory of this document.
```{r}
tmp <- readRDS("rdas/bm.cite.rds")
## get rid of S4. Make a count matrix and annotation data frame.
pd <- [email protected]
y <- as.matrix(tmp[["RNA"]]@counts)
adt <- as.matrix(tmp@assays$ADT@counts)
pd$celltype[pd$celltype == "CD1C DC"] <- "CD1c DC"
## remove the Prog as they seem very different and obscure the analysis
ind <- which(!grepl("Prog", pd$celltype))
y <- y[,ind]
adt <- adt[,ind]
pd <- pd[ind,]
```
## Reannotate data using an approach completely independent of count data
We will cluster the protein data. We then use the previous annotation to assign it a name, but the point is to define clusters, not to annotate cell-types.
```{r}
z <- log2(adt+0.5)
z <- sweep(z, 2, colMeans(z)) ## remove the cell effect... try without
colnames(z) <- pd$celltype
d <- dist(t(z))
K <- length(unique(pd$celltype))
cl <- cutree(hclust(d), k = K)
new_name <- sapply(1:K, function(k){
tmp <- table(factor(names(cl)[cl==k], levels = unique(pd$celltype)))
null <- table(factor(pd$celltype, levels = unique(pd$celltype)))
names(tmp)[which.max(tmp/null)] #max observed to expected
})
new_name <- make.unique(new_name)
pd$ind_celltype <- new_name[cl]
```
```{r, echo=FALSE, include=FALSE}
rm(tmp2); gc()
```
## Exploratory Data Analysis
First we make a boxplot of the total counts stratified by celltype annotated using count.
```{r}
library(tidyverse)
dslabs::ds_theme_set()
data.frame(n = colSums(y), batch = pd$orig.ident, cell_type = pd$celltype) %>%
mutate(cell_type = reorder(cell_type, n, median)) %>%
ggplot(aes(cell_type, n, fill = batch)) +
geom_boxplot(color = I("black")) + coord_flip() + scale_y_log10() +
ggtitle("data driven annotation")
```
Next we make a boxplot of the total counts stratified by the new clusters made using exclusively the protein data.
Both plot show strong evidence that cell type have different total number of UMI.
```{r}
data.frame(n = colSums(y), batch = pd$orig.ident, cell_type = pd$ind_celltype) %>%
mutate(cell_type = reorder(cell_type, n, median)) %>%
ggplot(aes(cell_type, n, fill = batch)) +
geom_boxplot(color = I("black")) + coord_flip() + scale_y_log10() +
ggtitle("protein driven annotation")
```
Next we do the same for the proportion of 0s.
```{r}
data.frame(prop_0 = colMeans(y==0), batch = pd$orig.ident, cell_type = pd$celltype) %>%
mutate(cell_type = reorder(cell_type, prop_0, median)) %>%
ggplot(aes(cell_type, prop_0, fill = batch)) +
geom_boxplot(color = I("black")) + coord_flip() + scale_y_log10() +
ggtitle("data driven annotation")
```
Now with the independent cluster
```{r}
data.frame(prop_0 = colMeans(y==0), batch = pd$orig.ident, cell_type = pd$ind_celltype) %>%
mutate(cell_type = reorder(cell_type, prop_0, median)) %>%
ggplot(aes(cell_type, prop_0, fill = batch)) +
geom_boxplot(color = I("black")) + coord_flip() + scale_y_log10() +
ggtitle("protein driven annotation")
```
### Distribution
We now are going to compare the distributions of each cell type. To do this we will compute the proportions of UMIs coming from each gene for each cell type. Note that we add up all the cells of the same cell type. Also note that the difference in total count will not show up here directly as we are computing proportions that add up to 1.
```{r}
inds <- split(1:nrow(pd), pd$celltype)
pis <- sapply(inds, function(ind){
rowSums(y[,ind])/sum(y[,ind])
})
```
We are going to make MA-plot versions of qqplots. The y-axis is the ratio of the quantiles and the x-axis is the average quantile. If these cells have the same distribution points will be on the horizontal line $y=1$.
We show 16 examples.
```{r, warning=FALSE}
tmp <- gtools::combinations(ncol(pis),2)
tmp <- tmp[sample(nrow(tmp),16),]
maplot <- function(x, y, ...)
plot(sqrt(x*y), ifelse(y==0 & x==0, 1, y/x), ...)
rafalib::mypar(4,4)
apply(tmp, 1, function(z){
i <- z[1]; j<-z[2]
maplot(
quantile(pis[,i], seq(0,1,len=250)),
quantile(pis[,j], seq(0,1,len=250)),
log="xy",
xlab=colnames(pis)[i], ylab=colnames(pis)[j],
ylim=c(0.5,2), type="b")
abline(h=1)
})
```
These look relatively straight except for the very highly expressed genes.
### FACS
Now we repeat the analysis for FACSorted samples.
```{r}
load("rdas/four_pbmcs.rda")
load("rdas/pbmcs_labels.rda")
data.frame(n = colSums(pbmcs_reference), cell_type = pbmcs_reference_labels) %>%
mutate(cell_type = reorder(cell_type, n, median)) %>%
ggplot(aes(cell_type, n)) +
geom_boxplot(color = I("black")) + coord_flip() + scale_y_log10() +
ggtitle("Annotation from FACS")
```
We see a cell-specific effect. But let's compare to the previous data:
```{r}
data.frame(n = colSums(y), batch = pd$orig.ident, cell_type = pd$celltype) %>%
filter(cell_type %in% c("CD14 Mono", "CD4 Naive","CD8 Naive","CD16+ NK")) %>%
mutate(cell_type = reorder(cell_type, n, median)) %>%
ggplot(aes(cell_type, n)) +
geom_boxplot(color = I("black")) + coord_flip() + scale_y_log10() +
ggtitle("data driven annotation")
```
One concern is that it is not following the same pattern.
Here are the same plots for proportion of 0s
```{r}
data.frame(prop_0 = colMeans(pbmcs_reference==0), cell_type = pbmcs_reference_labels) %>%
mutate(cell_type = reorder(cell_type, prop_0, median)) %>%
ggplot(aes(cell_type, prop_0)) +
geom_boxplot(color = I("black")) + coord_flip() + scale_y_log10() +
ggtitle("Annotation from FACS")
```
Here are the previous results
```{r}
data.frame(prop_0 = colMeans(y==0), batch = pd$orig.ident, cell_type = pd$celltype) %>%
filter(cell_type %in% c("CD14 Mono", "CD4 Naive","CD8 Naive","CD16+ NK")) %>%
mutate(cell_type = reorder(cell_type, prop_0, median)) %>%
ggplot(aes(cell_type, prop_0)) +
geom_boxplot(color = I("black")) + coord_flip() + scale_y_log10() +
ggtitle("data driven annotation")
```
Now let's look at the distributions as before:
```{r, warning=FALSE}
inds <- split(seq_along(pbmcs_reference_labels), pbmcs_reference_labels)
pis <- sapply(inds, function(ind) rowSums(pbmcs_reference[,ind])/sum(pbmcs_reference[,ind]))
rafalib::mypar(3,2)
maplot <- function(x, y, ...)
plot(sqrt(x*y), y/x, ...)
for(i in 1:(ncol(pis)-1)){
for(j in (i+1):ncol(pis)){
maplot(
quantile(pis[,i], seq(0,1,len=250)),
quantile(pis[,j], seq(0,1,len=250)),
log="xy",
xlab=colnames(pis)[i], ylab=colnames(pis)[j],
ylim=c(0.5,2), type="b")
abline(h=1)
}}
```