Merge pull request #1001 from allyhawkins/allyhawkins/initiate-final-…

…annotation-template Initiate "guide" notebook for validating cell type assignments
AlexsLemonade · Jan 27, 2025 · e7da87c · e7da87c
2 parents d3faa7c + 5fec540
commit e7da87c
Show file tree

Hide file tree

Showing 3 changed files with 280 additions and 0 deletions.
diff --git a/analyses/cell-type-ewings/template_notebooks/README.md b/analyses/cell-type-ewings/template_notebooks/README.md
@@ -6,3 +6,5 @@ This folder contains any template notebooks that are rendered as part of a workf
 **NOTE:** This workflow is no longer used in the cell type annotation analysis, has been removed from CI, and is no longer maintained! 
 2. `auc-singler-workflow`: This folder contains all template notebooks used in `auc-singler-annotation.sh`.
 3. `clustering-workflow`: This folder contains all template notebooks used in `evaluate-clusters.sh`. 
+4. `celltype-exploration.Rmd`: This notebook is meant to be used as a guide for assigning and evaluating the final cell type annotations for each library in `SCPCP000015`. 
+Full instructions on how to use this notebook as a guide can be found at the beginning of the notebook itself. 
diff --git a/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd b/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd
@@ -0,0 +1,198 @@
+---
+title: "Template notebook for validating cell type assignments for an individual library in SCPCP000015"
+author: Ally Hawkins
+date: "`r Sys.Date()`"
+output:
+  html_document:
+    toc: true
+    toc_depth: 3
+    code_folding: "hide"
+params:
+  sample_id: "SCPCS000490"
+  library_id: "SCPCL000822" 
+  cluster_nn: 20
+  cluster_res: 0.5
+---
+
+This notebook is meant to be a guide for compiling "final" cell type annotations for an individual library in `SCPCP000015`. 
+Results from `aucell-singler-annotation.sh`, `evaulate-clusters.sh`, and `run-aucell-ews-signatures.sh` are all combined and used to assign cell type annotations on a case by case basis. 
+
+Instructions for using this guide: 
+
+1. Ensure that you have a local copy of the results from `aucell-singler-annotation.sh`, `evaluate-clusters.sh` and `run-aucell-ews-signatures.sh` saved to `results`. 
+2. Copy the contents of this notebook to a new notebook titled `<library_id>_celltype-exploration.Rmd` and save in `exploratory_analysis/final_annotation_notebooks`. 
+3. Update the `title` in the `yaml` section and replace the `sample_id` and `library_id` with the correct IDs in the `params` list. 
+4. Optionally, you may choose to update the choices for clustering based on the results from `evaluate-clusters.sh`. 
+All clusters used will be calculated with the Leiden algorithm and the modularity objective function. 
+To modify the nearest neighbors (default: 20) and resolution (default: 0.5) chosen use the `cluster_nn` and `cluster_res` params. 
+5. Run through the notebook and update any sections of the notebook marked with the `{.manual-exploration}` tag. 
+6. Render the completed notebook which will produce the rendered `html` file and a TSV with cell type annotations for that library. 
+
+## Setup
+
+```{r packages}
+suppressPackageStartupMessages({
+  # load required packages
+  library(SingleCellExperiment)
+  library(ggplot2)
+})
+
+# Set default ggplot theme
+theme_set(
+  theme_classic()
+)
+
+# set seed
+set.seed(2024)
+```
+
+
+```{r base paths}
+# The base path for the OpenScPCA repository, found by its (hidden) .git directory
+repository_base <- rprojroot::find_root(rprojroot::is_git_root)
+
+# The current data directory, found within the repository base directory
+data_dir <- file.path(repository_base, "data", "current", "SCPCP000015")
+
+# The path to this module
+module_base <- file.path(repository_base, "analyses", "cell-type-ewings") 
+```
+
+```{r}
+# path to sce 
+sce_file <- file.path(data_dir, params$sample_id, glue::glue("{params$library_id}_processed.rds"))
+
+# path to workflow results
+workflow_results_dir <- file.path(module_base, "results")
+
+singler_results_dir <- file.path(workflow_results_dir, "aucell_singler_annotation", params$sample_id)
+singler_results_file <- file.path(singler_results_dir,
+                                  glue::glue("{params$library_id}_singler-classifications.tsv"))
+
+cluster_results_dir <- file.path(workflow_results_dir, "clustering", params$sample_id)
+cluster_results_file <- file.path(cluster_results_dir,
+                                  glue::glue("{params$library_id}_cluster-results.tsv"))
+
+aucell_results_dir <- file.path(workflow_results_dir, "aucell-ews-signatures", params$sample_id)
+aucell_results_file <- file.path(aucell_results_dir,
+                                 glue::glue("{params$library_id}_auc-ews-gene-signatures.tsv"))
+
+# small gene sets
+visser_marker_genes_file <- file.path(module_base, "references", "visser-all-marker-genes.tsv")
+cell_state_genes_file <- file.path(module_base, "references", "tumor-cell-state-markers.tsv")
+```
+
+```{r}
+# output file to save final annotations 
+results_dir <- file.path(module_base, "results", "final-annotations")
+output_file <- file.path(results_dir, glue::glue("{params$library_id}_celltype-annotations.tsv"))
+```
+
+
+```{r}
+# source in setup functions prep_results()
+setup_functions <- file.path(module_base, "template_notebooks", "utils", "setup-functions.R")
+source(setup_functions)
+
+# source in validation functions calculate_mean_markers()
+validation_functions <- file.path(module_base, "scripts", "utils", "tumor-validation-helpers.R")
+source(validation_functions)
+```
+
+```{r}
+stopifnot(
+  "sce file does not exist" = file.exists(sce_file),
+  "singler results file does not exist" = file.exists(singler_results_file),
+  "cluster results file does not exist" = file.exists(cluster_results_file),
+  "aucell results file does not exist" = file.exists(aucell_results_file)
+)
+```
+
+
+```{r, message=FALSE}
+# read in sce
+sce <- readr::read_rds(sce_file)
+
+# read in workflow results
+singler_df <- readr::read_tsv(singler_results_file)
+cluster_df <- readr::read_tsv(cluster_results_file)
+aucell_df <- readr::read_tsv(aucell_results_file)
+
+# read in marker genes and combine into one list 
+visser_markers_df <- readr::read_tsv(visser_marker_genes_file) |> 
+  dplyr::select(cell_type, ensembl_gene_id) |> 
+  unique()
+  
+cell_state_markers_df <- readr::read_tsv(cell_state_genes_file) |> 
+  dplyr::select(cell_type = cell_state, ensembl_gene_id)
+
+all_markers_df <- dplyr::bind_rows(list(visser_markers_df, cell_state_markers_df))
+```
+
+## Prepare data for plotting
+
+```{r}
+all_results_df <- prep_results(
+  sce, 
+  singler_df, 
+  cluster_df, 
+  aucell_df,
+  cluster_nn = params$cluster_nn,
+  cluster_res = params$cluster_res
+  )
+  
+cell_types <- unique(all_markers_df$cell_type)
+
+# get the mean expression of all genes for each cell state
+gene_exp_df <- cell_types |>
+  purrr::map(\(type){
+    calculate_mean_markers(all_markers_df, sce, type, cell_type)
+  }) |>
+  purrr::reduce(dplyr::inner_join, by = "barcodes")
+
+all_info_df <- all_results_df |> 
+  dplyr::left_join(gene_exp_df, by = "barcodes")
+```
+
+## Summary of workflow results
+
+TODO: Insert plots that will summarize findings from each of the workflows
+- UMAPs of SingleR, clusters, AUC values and custom gene set means 
+- Density plots by cluster of AUC values and custom gene set means
+- Maybe heatmaps with cluster annotation of AUC scores and custom gene set means 
+
+## Re-cluster tumor cells {.manual-exploration}
+
+<!-- Use this section to label tumor cells based on the above findings. 
+Any cells that are labeled as tumor will then be re-clustered and plots showing only tumor cells
+can be created to identify tumor cell states --> 
+
+TODO: Functions for re-clustering tumor cells
+Show the same plots across the tumor only clusters and assign tumor cell states to each cluster
+
+## Additional exploration {.manual-exploration}
+
+<!--This section is for any additional exploration that may be needed to finalize annotations for this library. 
+If not using, please delete this section.
+For example, here you may want to dive into the normal cell types and make adjustments as needed
+--> 
+
+## Validate final tumor and normal annotation {.manual-exploration}
+
+<!-- This section should be used to update the assignments --> 
+
+TODO: Insert plots that will be useful for validation (UMAPs, heatmaps, density plots)
+
+## Prepare annotations {.manual-exploration}
+
+<!-- This section should be used to create the final.final table with cell type annotations for export --> 
+
+TODO: Code and instructions for exporting such as what columns should be named. 
+
+## Session info 
+
+```{r session info}
+# record the versions of the packages used in this analysis and other environment information
+sessionInfo()
+```
+
diff --git a/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R b/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R
@@ -0,0 +1,80 @@
+# These functions are used in `celltype-exploration.Rmd` 
+# They are used for reading in and setting up the cell type results
+
+#' Combine workflow results into a single data frame
+#' 
+#' Note that this function will only include clustering results from Leiden with modularity in the output
+#'
+#' @param sce Processed SingleCellExperiment object with UMAP embeddings 
+#' @param singler_df Data frame with results from `aucell-singler-annotation.sh` workflow
+#' @param cluster_df Data frame with results from `evaluate-clusters.sh` workflow
+#' @param aucell_df Data frame with results from `run-aucell-ews-signatures.sh` workflow
+#' @param cluster_nn Value of nearest neighbors to use for cluster results. Default is 20.
+#' @param cluster_res Value of resolution to use for cluster results. Default is 20.  
+#'
+prep_results <- function(
+    sce,
+    singler_df,
+    cluster_df,
+    aucell_df,
+    cluster_nn = 20,
+    cluster_res = 0.5
+) {
+
+  ## grab UMAP 
+  umap_df <- sce |>
+    scuttle::makePerCellDF(use.dimred = "UMAP") |>
+    # replace UMAP.1 with UMAP1 and get rid of excess columns
+    dplyr::select(barcodes, UMAP1 = UMAP.1, UMAP2 = UMAP.2)
+
+  ## prep singler data
+  singler_df <- singler_df |> 
+    dplyr::mutate(
+      # first grab anything that is tumor and label it tumor
+      # NA should be unknown
+      singler_annotation = dplyr::case_when(
+        stringr::str_detect(singler_annotation, "tumor") ~ "tumor",
+        is.na(singler_annotation) ~ "unknown", # make sure to separate out unknown labels
+        .default = singler_annotation
+      ) |>
+        forcats::fct_relevel("tumor", after = 0),
+      # get the top cell types for plotting later
+      singler_lumped = singler_annotation |>
+        forcats::fct_lump_n(7, other_level = "All remaining cell types", ties.method = "first") |>
+        forcats::fct_infreq() |>
+        forcats::fct_relevel("All remaining cell types", after = Inf)
+    )
+
+  ## prep cluster data 
+  cluster_df <- cluster_df |> 
+    # filter to the clustering results we want to use 
+    dplyr::filter(
+      cluster_method == "leiden_mod",
+      nn == cluster_nn,
+      resolution == cluster_res
+    ) |> 
+    dplyr::select(
+      barcodes = cell_id,
+      cluster
+    )
+
+  ## prep aucell 
+  aucell_wide_df <- aucell_df |> 
+    dplyr::mutate(
+      assignment = auc > auc_threshold
+    ) |> 
+    tidyr::pivot_wider(
+      id_cols = "barcodes",
+      names_from = "gene_set",
+      values_from = c(auc, assignment)
+    )
+
+  ## combine into one data frame 
+  all_results_df <- umap_df |> 
+    dplyr::left_join(singler_df, by = c("barcodes")) |> 
+    dplyr::left_join(cluster_df, by = c("barcodes")) |> 
+    dplyr::left_join(aucell_wide_df, by = c("barcodes"))
+
+  return(all_results_df)
+
+}