Merge pull request #596 from AlexsLemonade/allyhawkins/celltype-NAs

Account for mismatches between old cell typing results and current processed object and handle missing UMAPs
AlexsLemonade · Nov 27, 2023 · 01a45ca · 01a45ca
2 parents c6930fe + 7415240
commit 01a45ca
Show file tree

Hide file tree

Showing 4 changed files with 123 additions and 40 deletions.
diff --git a/bin/add_celltypes_to_sce.R b/bin/add_celltypes_to_sce.R
@@ -123,6 +123,24 @@ if (!is.null(opt$singler_results)) {
       )
   }
 
+  # account for any unclassified cells
+  # we need to do this before joining with SingleR results, because otherwise
+  # we won't be able to distinguish NA from SingleR vs. NA because it isn't present in SingleR results
+  # get list of barcodes in SCE but not in SingleR results
+  missing_barcodes <- setdiff(colnames(sce), annotations_df$barcodes)
+
+  # only if there are missing barcodes, append them to the annotations df
+  if (length(missing_barcodes) > 0) {
+    # create a data frame with unclassified barcodes
+    unclassified_df <- data.frame(
+      barcodes = missing_barcodes,
+      singler_celltype_annotation = "Unclassified cell"
+    )
+
+    # combine into one data frame with classified and unclassified cells
+    annotations_df <- dplyr::bind_rows(annotations_df, unclassified_df)
+  }
+
   # add annotations to colData
   new_coldata <- colData(sce) |>
     as.data.frame() |>
@@ -174,7 +192,9 @@ if (!is.null(opt$cellassign_predictions)) {
 
   # join by barcode to make sure assignments are in the right order
   celltype_assignments <- data.frame(barcode = sce$barcodes) |>
-    dplyr::left_join(celltype_assignments, by = "barcode")
+    dplyr::left_join(celltype_assignments, by = "barcode") |>
+    # any cells that are NA were not classified by cellassign
+    dplyr::mutate(celltype = ifelse(is.na(celltype), "Unclassified cell", celltype))
 
   # add cell type and prediction to colData
   sce$cellassign_celltype_annotation <- celltype_assignments$celltype

diff --git a/templates/qc_report/celltypes_qc.rmd b/templates/qc_report/celltypes_qc.rmd
@@ -105,7 +105,7 @@ plot_umap <- function(
       )
     ) +
     theme(
-      legend.position = "bottom", 
+      legend.position = "bottom",
       aspect.ratio = 1
     )
 }
@@ -137,7 +137,6 @@ glue::glue("
 ")
 ```
 
-
 ## Statistics
 
 ```{r, warning = FALSE}
@@ -146,6 +145,47 @@ celltype_df <- create_celltype_df(processed_sce)
 ```
 
 
+```{r, eval = (has_cellassign || has_singler), results='asis'}
+unclassified_methods <- c()
+
+# check for unclassified SingleR cells
+if (has_singler && any(celltype_df$singler_celltype_annotation == "Unclassified cell")) {
+  unclassified_methods <- c(unclassified_methods, "singler")
+
+  # remove all unclassified cells
+  celltype_df <- celltype_df |>
+    dplyr::filter(
+      singler_celltype_annotation != "Unclassified cell"
+    )
+}
+
+# check for unclassified cellassign cells
+if (has_cellassign && any(celltype_df$cellassign_celltype_annotation == "Unclassified cell")) {
+  unclassified_methods <- c(unclassified_methods, "cellassign")
+
+  # remove unclassified cells
+  celltype_df <- celltype_df |>
+    dplyr::filter(
+      cellassign_celltype_annotation != "Unclassified cell"
+    )
+}
+
+# print a warning if either singleR or cellAssign have any unclassified cells
+if (length(unclassified_methods) > 0) {
+  unclassified_methods <- stringr::str_flatten_comma(unclassified_methods, last = ", and ")
+  glue::glue("
+ <div class=\"alert alert-info\">
+
+  When reprocessing this library, old results from running {unclassified_methods} were used.
+  This means results were calculated with a slightly different set of cells.
+  Any cells that are missing have been annotated as `Unclassified cell` in the object and will not be shown in any plots.
+
+  </div>
+")
+}
+```
+
+
 ```{r, eval = has_submitter}
 knitr::asis_output('
 ### Submitter-provided cell type annotations\n
@@ -176,21 +216,26 @@ create_celltype_n_table(celltype_df, cellassign_celltype_annotation) |>
   format_celltype_n_table()
 ```
 
+
+```{r, eval = has_umap}
+knitr::asis_output(glue::glue("
 ## UMAPs
 
 In this section, we show UMAPs colored by clusters.
-Clusters were calculated using the graph-based `r metadata(processed_sce)$cluster_algorithm` algorithm with `r metadata(processed_sce)$cluster_weighting` weighting.
+Clusters were calculated using the graph-based {metadata(processed_sce)$cluster_algorithm} algorithm with  {metadata(processed_sce)$cluster_weighting} weighting.
+"))
+```
 
 
-```{r}
+```{r, eval = has_umap}
 # Create dataset for plotting UMAPs with lumped cell types
 umap_df <- lump_celltypes(celltype_df)
 ```
 
 
 <!-- First UMAP: clusters -->
 
-```{r, eval = has_multiplex, results='asis'}
+```{r, eval = has_umap && has_multiplex, results='asis'}
 glue::glue("
   <div class=\"alert alert-info\">
     This library contains multiple samples that have not been batch-corrected, which may confound clustering assignments.
@@ -200,7 +245,7 @@ glue::glue("
 ```
 
 
-```{r message=FALSE, warning=FALSE}
+```{r eval = has_umap, message=FALSE, warning=FALSE}
 clusters_plot <- plot_umap(
   umap_df,
   cluster,
@@ -220,15 +265,20 @@ if (length(levels(umap_df$cluster)) <= 8) {
 ```
 
 
-Next, we show UMAPs colored by cell types.
+```{r, eval = has_umap}
+knitr::asis_output(
+  'Next, we show UMAPs colored by cell types.
 A separate UMAP is shown for each cell typing method used.
 
 For legibility, only the seven most common cell types are shown.
 All other cell types are grouped together and labeled "All remaining cell types" (not to be confused with "Unknown cell type" which represents cells that could not be classified).
+  '
+)
+```
 
 <!-- Now, UMAPs of cell types, where present -->
 
-```{r eval = has_submitter, message=FALSE, warning=FALSE}
+```{r eval = has_submitter && has_umap, message=FALSE, warning=FALSE}
 plot_umap(
   umap_df,
   submitter_celltype_annotation_lumped,
@@ -240,7 +290,7 @@ plot_umap(
 ```
 
 
-```{r eval=has_singler, message=FALSE, warning=FALSE}
+```{r eval=has_singler && has_umap, message=FALSE, warning=FALSE}
 plot_umap(
   umap_df,
   singler_celltype_annotation_lumped,
@@ -252,7 +302,7 @@ plot_umap(
 ```
 
 
-```{r, eval = has_cellassign, message=FALSE, warning=FALSE}
+```{r, eval = has_cellassign && has_umap, message=FALSE, warning=FALSE}
 plot_umap(
   umap_df,
   cellassign_celltype_annotation_lumped,

diff --git a/templates/qc_report/celltypes_supplemental_report.rmd b/templates/qc_report/celltypes_supplemental_report.rmd
@@ -230,7 +230,7 @@ calculate_plot_height <- function(
     row_names,
     col_names,
     n_spacers,
-    spacer_size = heatmap_padding, 
+    spacer_size = heatmap_padding,
     min_height = 4) {
   # first, based on number of cells in row_names:
   #   6 cell types to an inch
@@ -244,7 +244,7 @@ calculate_plot_height <- function(
   # finally, add in any padding
   heat_height <- heat_height + n_spacers * spacer_size
 
-  return( 
+  return(
     max(c(min_height, heat_height))
   )
 }
@@ -261,6 +261,9 @@ has_singler <- "singler" %in% metadata(processed_sce)$celltype_methods
 has_cellassign <- "cellassign" %in% metadata(processed_sce)$celltype_methods
 has_submitter <- "submitter" %in% metadata(processed_sce)$celltype_methods
 
+# check for umap
+has_umap <- "UMAP" %in% reducedDimNames(processed_sce)
+
 # what celltypes are available?
 available_celltypes <- c(
   ifelse(has_submitter, "Submitter", NA),
@@ -709,10 +712,13 @@ density_bw <- 0.03
 #  save the maximum for determining geom_segment height
 y_max <- celltype_df$cellassign_max_prediction |>
   split(celltype_df$cellassign_celltype_annotation) |>
+  # make sure we get rid of any small groups
+  purrr::discard(\(x) sum(is.finite(x)) <= 2) |>
+  # remove any NA's that may have slipped in
   purrr::map_dbl(
-    \(x) max(density(x, bw = density_bw)$y)
+    \(x) max(density(x, bw = density_bw, na.rm = TRUE)$y)
   ) |>
-  max()
+  max(na.rm = TRUE)
 
 # add count to celltype_df for setting alpha and yend values
 celltype_df <- celltype_df |>

diff --git a/templates/qc_report/utils/celltype_functions.rmd b/templates/qc_report/utils/celltype_functions.rmd
@@ -12,17 +12,27 @@ library(SingleCellExperiment)
 #'
 #' @return `celltype_df` with column of cell types, as factors, for each annotation method
 create_celltype_df <- function(processed_sce) {
-  celltype_df <- processed_sce |>
-    scuttle::makePerCellDF(use.dimred = "UMAP") |>
-    # rename UMAP columns as needed to remove potential period added by `scuttle::makePerCellDF`
-    dplyr::rename_with(
-      \(x) stringr::str_replace(x, "^UMAP\\.", "UMAP"),
-      starts_with("UMAP")
-    ) |>
+  # only incorporate UMAP coordinates if present
+  if ("UMAP" %in% reducedDimNames(processed_sce)) {
+    celltype_df <- processed_sce |>
+      scuttle::makePerCellDF(use.dimred = "UMAP") |>
+      # rename UMAP columns as needed to remove potential period added by `scuttle::makePerCellDF`
+      dplyr::rename_with(
+        \(x) stringr::str_replace(x, "^UMAP\\.", "UMAP"),
+        starts_with("UMAP")
+      )
+    # otherwise just grab the colData
+  } else {
+    celltype_df <- colData(processed_sce) |>
+      as.data.frame()
+  }
+
+  celltype_df <- celltype_df |>
     # only keep columns of interest
     dplyr::select(
       barcodes,
-      cluster,
+      # account for potentially missing columns
+      contains("cluster"),
       contains("UMAP"),
       contains("singler"),
       contains("cellassign"),
@@ -62,7 +72,6 @@ create_celltype_df <- function(processed_sce) {
 prepare_automated_annotation_values <- function(
     df,
     annotation_column) {
-
   unknown_string <- "Unknown cell type"
 
   df |>
@@ -75,10 +84,10 @@ prepare_automated_annotation_values <- function(
         # otherwise, keep it
         .default = {{ annotation_column }}
       ) |>
-      # order column by number of cells
-      forcats::fct_infreq() |>
-      # make "Unknown cell type" the last level
-      forcats::fct_relevel(unknown_string, after = Inf)
+        # order column by number of cells
+        forcats::fct_infreq() |>
+        # make "Unknown cell type" the last level
+        forcats::fct_relevel(unknown_string, after = Inf)
     )
 }
 
@@ -91,46 +100,44 @@ prepare_automated_annotation_values <- function(
 #'
 #' @return Updated data frame with the `annotation_column` reformatted
 prepare_submitter_annotation_values <- function(df) {
-    
   if (!("submitter_celltype_annotation" %in% names(df))) {
     stop("Could not process submitter annotation values; missing submitter column.")
   }
-  
+
   # string for submitter-excluded cell types
   submitter_excluded_string <- "Submitter-excluded"
-  
+
   # define baseline levels in descending order of frequency
   submitter_levels <- df$submitter_celltype_annotation |>
-    # that this does not actually order the final factor - this is just for 
+    # that this does not actually order the final factor - this is just for
     #  preparing to handle "unknown" levels
     forcats::fct_infreq() |>
     levels()
-    
+
   # strings which might be "unknown"-y
-  unknown_strings <- c("na", "nan", "null", "unknown",  "")
+  unknown_strings <- c("na", "nan", "null", "unknown", "")
 
   # remove non-word characters & make lower case
   # note by removing \\W, we match `<n/a>` and similar
   submitter_normalized_levels <- submitter_levels |>
     stringr::str_replace_all("\\W+", "") |>
     stringr::str_to_lower()
-  
+
   # determine which of those levels are unknowns
   unknown_levels <- submitter_levels[submitter_normalized_levels %in% unknown_strings]
-  
+
   # if submitter-excluded is present it should go at the VERY END regardless of count
   if (submitter_excluded_string %in% submitter_levels) {
     unknown_levels <- c(unknown_levels, submitter_excluded_string)
   }
-  
+
   # Move unknown levels to the end in order of frequency, followed by submitter-excluded
   df$submitter_celltype_annotation <- df$submitter_celltype_annotation |>
     # still need to actually arrange by frequency first
     forcats::fct_infreq() |>
     # move unknowns, in order, to the end
     forcats::fct_relevel(unknown_levels, after = Inf)
-  
-  return(df)  
+
+  return(df)
 }
 ```
-