Skip to content

Commit

Permalink
Merge pull request #596 from AlexsLemonade/allyhawkins/celltype-NAs
Browse files Browse the repository at this point in the history
Account for mismatches between old cell typing results and current processed object and handle missing UMAPs
  • Loading branch information
allyhawkins authored Nov 27, 2023
2 parents c6930fe + 7415240 commit 01a45ca
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 40 deletions.
22 changes: 21 additions & 1 deletion bin/add_celltypes_to_sce.R
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,24 @@ if (!is.null(opt$singler_results)) {
)
}

# account for any unclassified cells
# we need to do this before joining with SingleR results, because otherwise
# we won't be able to distinguish NA from SingleR vs. NA because it isn't present in SingleR results
# get list of barcodes in SCE but not in SingleR results
missing_barcodes <- setdiff(colnames(sce), annotations_df$barcodes)

# only if there are missing barcodes, append them to the annotations df
if (length(missing_barcodes) > 0) {
# create a data frame with unclassified barcodes
unclassified_df <- data.frame(
barcodes = missing_barcodes,
singler_celltype_annotation = "Unclassified cell"
)

# combine into one data frame with classified and unclassified cells
annotations_df <- dplyr::bind_rows(annotations_df, unclassified_df)
}

# add annotations to colData
new_coldata <- colData(sce) |>
as.data.frame() |>
Expand Down Expand Up @@ -174,7 +192,9 @@ if (!is.null(opt$cellassign_predictions)) {

# join by barcode to make sure assignments are in the right order
celltype_assignments <- data.frame(barcode = sce$barcodes) |>
dplyr::left_join(celltype_assignments, by = "barcode")
dplyr::left_join(celltype_assignments, by = "barcode") |>
# any cells that are NA were not classified by cellassign
dplyr::mutate(celltype = ifelse(is.na(celltype), "Unclassified cell", celltype))

# add cell type and prediction to colData
sce$cellassign_celltype_annotation <- celltype_assignments$celltype
Expand Down
70 changes: 60 additions & 10 deletions templates/qc_report/celltypes_qc.rmd
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ plot_umap <- function(
)
) +
theme(
legend.position = "bottom",
legend.position = "bottom",
aspect.ratio = 1
)
}
Expand Down Expand Up @@ -137,7 +137,6 @@ glue::glue("
")
```


## Statistics

```{r, warning = FALSE}
Expand All @@ -146,6 +145,47 @@ celltype_df <- create_celltype_df(processed_sce)
```


```{r, eval = (has_cellassign || has_singler), results='asis'}
unclassified_methods <- c()
# check for unclassified SingleR cells
if (has_singler && any(celltype_df$singler_celltype_annotation == "Unclassified cell")) {
unclassified_methods <- c(unclassified_methods, "singler")
# remove all unclassified cells
celltype_df <- celltype_df |>
dplyr::filter(
singler_celltype_annotation != "Unclassified cell"
)
}
# check for unclassified cellassign cells
if (has_cellassign && any(celltype_df$cellassign_celltype_annotation == "Unclassified cell")) {
unclassified_methods <- c(unclassified_methods, "cellassign")
# remove unclassified cells
celltype_df <- celltype_df |>
dplyr::filter(
cellassign_celltype_annotation != "Unclassified cell"
)
}
# print a warning if either singleR or cellAssign have any unclassified cells
if (length(unclassified_methods) > 0) {
unclassified_methods <- stringr::str_flatten_comma(unclassified_methods, last = ", and ")
glue::glue("
<div class=\"alert alert-info\">
When reprocessing this library, old results from running {unclassified_methods} were used.
This means results were calculated with a slightly different set of cells.
Any cells that are missing have been annotated as `Unclassified cell` in the object and will not be shown in any plots.
</div>
")
}
```


```{r, eval = has_submitter}
knitr::asis_output('
### Submitter-provided cell type annotations\n
Expand Down Expand Up @@ -176,21 +216,26 @@ create_celltype_n_table(celltype_df, cellassign_celltype_annotation) |>
format_celltype_n_table()
```


```{r, eval = has_umap}
knitr::asis_output(glue::glue("
## UMAPs
In this section, we show UMAPs colored by clusters.
Clusters were calculated using the graph-based `r metadata(processed_sce)$cluster_algorithm` algorithm with `r metadata(processed_sce)$cluster_weighting` weighting.
Clusters were calculated using the graph-based {metadata(processed_sce)$cluster_algorithm} algorithm with {metadata(processed_sce)$cluster_weighting} weighting.
"))
```


```{r}
```{r, eval = has_umap}
# Create dataset for plotting UMAPs with lumped cell types
umap_df <- lump_celltypes(celltype_df)
```


<!-- First UMAP: clusters -->

```{r, eval = has_multiplex, results='asis'}
```{r, eval = has_umap && has_multiplex, results='asis'}
glue::glue("
<div class=\"alert alert-info\">
This library contains multiple samples that have not been batch-corrected, which may confound clustering assignments.
Expand All @@ -200,7 +245,7 @@ glue::glue("
```


```{r message=FALSE, warning=FALSE}
```{r eval = has_umap, message=FALSE, warning=FALSE}
clusters_plot <- plot_umap(
umap_df,
cluster,
Expand All @@ -220,15 +265,20 @@ if (length(levels(umap_df$cluster)) <= 8) {
```


Next, we show UMAPs colored by cell types.
```{r, eval = has_umap}
knitr::asis_output(
'Next, we show UMAPs colored by cell types.
A separate UMAP is shown for each cell typing method used.
For legibility, only the seven most common cell types are shown.
All other cell types are grouped together and labeled "All remaining cell types" (not to be confused with "Unknown cell type" which represents cells that could not be classified).
'
)
```

<!-- Now, UMAPs of cell types, where present -->

```{r eval = has_submitter, message=FALSE, warning=FALSE}
```{r eval = has_submitter && has_umap, message=FALSE, warning=FALSE}
plot_umap(
umap_df,
submitter_celltype_annotation_lumped,
Expand All @@ -240,7 +290,7 @@ plot_umap(
```


```{r eval=has_singler, message=FALSE, warning=FALSE}
```{r eval=has_singler && has_umap, message=FALSE, warning=FALSE}
plot_umap(
umap_df,
singler_celltype_annotation_lumped,
Expand All @@ -252,7 +302,7 @@ plot_umap(
```


```{r, eval = has_cellassign, message=FALSE, warning=FALSE}
```{r, eval = has_cellassign && has_umap, message=FALSE, warning=FALSE}
plot_umap(
umap_df,
cellassign_celltype_annotation_lumped,
Expand Down
14 changes: 10 additions & 4 deletions templates/qc_report/celltypes_supplemental_report.rmd
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ calculate_plot_height <- function(
row_names,
col_names,
n_spacers,
spacer_size = heatmap_padding,
spacer_size = heatmap_padding,
min_height = 4) {
# first, based on number of cells in row_names:
# 6 cell types to an inch
Expand All @@ -244,7 +244,7 @@ calculate_plot_height <- function(
# finally, add in any padding
heat_height <- heat_height + n_spacers * spacer_size
return(
return(
max(c(min_height, heat_height))
)
}
Expand All @@ -261,6 +261,9 @@ has_singler <- "singler" %in% metadata(processed_sce)$celltype_methods
has_cellassign <- "cellassign" %in% metadata(processed_sce)$celltype_methods
has_submitter <- "submitter" %in% metadata(processed_sce)$celltype_methods
# check for umap
has_umap <- "UMAP" %in% reducedDimNames(processed_sce)
# what celltypes are available?
available_celltypes <- c(
ifelse(has_submitter, "Submitter", NA),
Expand Down Expand Up @@ -709,10 +712,13 @@ density_bw <- 0.03
# save the maximum for determining geom_segment height
y_max <- celltype_df$cellassign_max_prediction |>
split(celltype_df$cellassign_celltype_annotation) |>
# make sure we get rid of any small groups
purrr::discard(\(x) sum(is.finite(x)) <= 2) |>
# remove any NA's that may have slipped in
purrr::map_dbl(
\(x) max(density(x, bw = density_bw)$y)
\(x) max(density(x, bw = density_bw, na.rm = TRUE)$y)
) |>
max()
max(na.rm = TRUE)
# add count to celltype_df for setting alpha and yend values
celltype_df <- celltype_df |>
Expand Down
57 changes: 32 additions & 25 deletions templates/qc_report/utils/celltype_functions.rmd
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,27 @@ library(SingleCellExperiment)
#'
#' @return `celltype_df` with column of cell types, as factors, for each annotation method
create_celltype_df <- function(processed_sce) {
celltype_df <- processed_sce |>
scuttle::makePerCellDF(use.dimred = "UMAP") |>
# rename UMAP columns as needed to remove potential period added by `scuttle::makePerCellDF`
dplyr::rename_with(
\(x) stringr::str_replace(x, "^UMAP\\.", "UMAP"),
starts_with("UMAP")
) |>
# only incorporate UMAP coordinates if present
if ("UMAP" %in% reducedDimNames(processed_sce)) {
celltype_df <- processed_sce |>
scuttle::makePerCellDF(use.dimred = "UMAP") |>
# rename UMAP columns as needed to remove potential period added by `scuttle::makePerCellDF`
dplyr::rename_with(
\(x) stringr::str_replace(x, "^UMAP\\.", "UMAP"),
starts_with("UMAP")
)
# otherwise just grab the colData
} else {
celltype_df <- colData(processed_sce) |>
as.data.frame()
}
celltype_df <- celltype_df |>
# only keep columns of interest
dplyr::select(
barcodes,
cluster,
# account for potentially missing columns
contains("cluster"),
contains("UMAP"),
contains("singler"),
contains("cellassign"),
Expand Down Expand Up @@ -62,7 +72,6 @@ create_celltype_df <- function(processed_sce) {
prepare_automated_annotation_values <- function(
df,
annotation_column) {
unknown_string <- "Unknown cell type"
df |>
Expand All @@ -75,10 +84,10 @@ prepare_automated_annotation_values <- function(
# otherwise, keep it
.default = {{ annotation_column }}
) |>
# order column by number of cells
forcats::fct_infreq() |>
# make "Unknown cell type" the last level
forcats::fct_relevel(unknown_string, after = Inf)
# order column by number of cells
forcats::fct_infreq() |>
# make "Unknown cell type" the last level
forcats::fct_relevel(unknown_string, after = Inf)
)
}
Expand All @@ -91,46 +100,44 @@ prepare_automated_annotation_values <- function(
#'
#' @return Updated data frame with the `annotation_column` reformatted
prepare_submitter_annotation_values <- function(df) {
if (!("submitter_celltype_annotation" %in% names(df))) {
stop("Could not process submitter annotation values; missing submitter column.")
}
# string for submitter-excluded cell types
submitter_excluded_string <- "Submitter-excluded"
# define baseline levels in descending order of frequency
submitter_levels <- df$submitter_celltype_annotation |>
# that this does not actually order the final factor - this is just for
# that this does not actually order the final factor - this is just for
# preparing to handle "unknown" levels
forcats::fct_infreq() |>
levels()
# strings which might be "unknown"-y
unknown_strings <- c("na", "nan", "null", "unknown", "")
unknown_strings <- c("na", "nan", "null", "unknown", "")
# remove non-word characters & make lower case
# note by removing \\W, we match `<n/a>` and similar
submitter_normalized_levels <- submitter_levels |>
stringr::str_replace_all("\\W+", "") |>
stringr::str_to_lower()
# determine which of those levels are unknowns
unknown_levels <- submitter_levels[submitter_normalized_levels %in% unknown_strings]
# if submitter-excluded is present it should go at the VERY END regardless of count
if (submitter_excluded_string %in% submitter_levels) {
unknown_levels <- c(unknown_levels, submitter_excluded_string)
}
# Move unknown levels to the end in order of frequency, followed by submitter-excluded
df$submitter_celltype_annotation <- df$submitter_celltype_annotation |>
# still need to actually arrange by frequency first
forcats::fct_infreq() |>
# move unknowns, in order, to the end
forcats::fct_relevel(unknown_levels, after = Inf)
return(df)
return(df)
}
```

0 comments on commit 01a45ca

Please sign in to comment.