From 792e3b396a2d0f747606abf6690a149fa0d30627 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Thu, 23 Jan 2025 16:24:26 -0600 Subject: [PATCH 1/8] setup function for data frame prep --- .../utils/setup-functions.R | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R diff --git a/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R b/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R new file mode 100644 index 000000000..220b64449 --- /dev/null +++ b/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R @@ -0,0 +1,79 @@ +# These functions are used in `celltype-exploration.Rmd` +# They are used for reading in and setting up the cell type results + +# read in and prepare combined data frame +#' Combine workflow results into a single data frame +#' +#' @param sce Processed SingleCellExperiment object with UMAP embeddings +#' @param singler_df Data frame with results from `aucell-singler-annotation.sh` workflow +#' @param cluster_df Data frame with results from `evaluate-clusters.sh` workflow +#' @param aucell_df Data frame with results from `run-aucell-ews-signatures.sh` workflow +#' @param cluster_nn Value of nearest neighbors to use for cluster results. Default is 20. +#' @param cluster_res Value of resolution to use for cluster results. Default is 20. +#' +prep_results <- function( + sce, + singler_df, + cluster_df, + aucell_df, + cluster_nn = 20, + cluster_res = 0.5 +) { + + ## grab UMAP + umap_df <- sce |> + scuttle::makePerCellDF(use.dimred = "UMAP") |> + # replace UMAP.1 with UMAP1 and get rid of excess columns + dplyr::select(barcodes, UMAP1 = UMAP.1, UMAP2 = UMAP.2) + + ## prep singler data + singler_df <- singler_df |> + dplyr::mutate( + # first grab anything that is tumor and label it tumor + # NA should be unknown + singler_annotation = dplyr::case_when( + stringr::str_detect(singler_annotation, "tumor") ~ "tumor", + is.na(singler_annotation) ~ "unknown", # make sure to separate out unknown labels + .default = singler_annotation + ) |> + forcats::fct_relevel("tumor", after = 0), + # get the top cell types for plotting later + singler_lumped = singler_annotation |> + forcats::fct_lump_n(7, other_level = "All remaining cell types", ties.method = "first") |> + forcats::fct_infreq() |> + forcats::fct_relevel("All remaining cell types", after = Inf) + ) + + ## prep cluster data + cluster_df <- cluster_df |> + # filter to the clustering results we want to use + dplyr::filter( + cluster_method == "leiden_mod", + nn == cluster_nn, + resolution == cluster_res + ) |> + dplyr::select( + barcodes = cell_id, + cluster + ) + + ## prep aucell + aucell_wide_df <- aucell_df |> + dplyr::mutate( + assignment = auc > auc_threshold + ) |> + tidyr::pivot_wider( + id_cols = "barcodes", + names_from = "gene_set", + values_from = c(auc, assignment) + ) + + ## combine into one data frame + all_results_df <- umap_df |> + dplyr::left_join(singler_df, by = c("barcodes")) |> + dplyr::left_join(cluster_df, by = c("barcodes")) |> + dplyr::left_join(aucell_wide_df, by = c("barcodes")) + + return(all_results_df) + +} From 4b6581e3eb39944adc29b50d3e4a5947b8b1d467 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Thu, 23 Jan 2025 16:24:33 -0600 Subject: [PATCH 2/8] initiate "guide" notebook --- .../celltype-exploration.Rmd | 199 ++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd diff --git a/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd b/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd new file mode 100644 index 000000000..5395cc10b --- /dev/null +++ b/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd @@ -0,0 +1,199 @@ +--- +title: "Template notebook for validating cell type assignments for an individual library in SCPCP000015" +author: Ally Hawkins +date: "`r Sys.Date()`" +output: + html_document: + toc: true + toc_depth: 3 + code_folding: "hide" +params: + sample_id: "SCPCS000490" + library_id: "SCPCL000822" + cluster_nn: 20 + cluster_res: 0.5 +--- + +This notebook is meant to be a guide for compiling "final" cell type annotations for an individual library in `SCPCP000015`. +Results from `aucell-singler-annotation.sh`, `evaulate-clusters.sh`, and `run-aucell-ews-signatures.sh` are all combined and used to assign cell type annotations on a case by case basis. + +Instructions for using this guide: + +1. Ensure that you have a local copy of the results from `aucell-singler-annotation.sh`, `evaluate-clusters.sh` and `run-aucell-ews-signatures.sh` saved to `results`. +2. Copy the contents of this notebook to a new notebook titled `_celltype-exploration.Rmd` and save in `exploratory_analysis/final_annotation_notebooks`. +3. Replace the `sample_id` and `library_id` with the correct IDs in the `params` list. +4. Optionally, you may choose to update the choices for clustering based on the results from `evaluate-clusters.sh`. +All clusters used will be calculated with the Leiden algorithm and the modularity objective function. +To modify the nearest neighbors (default: 20) and resolution (default: 0.5) chosen use the `cluster_nn` and `cluster_res` params. +5. Run through the notebook and update any sections of the notebook marked with `**Manual exploration**`. +6. Render the completed notebook which will produce the rendered `html` file and a TSV with cell type annotations for that library. + +## Setup + +```{r packages} +suppressPackageStartupMessages({ + # load required packages + library(SingleCellExperiment) + library(ggplot2) +}) + +# Set default ggplot theme +theme_set( + theme_classic() +) + +# set seed +set.seed(2024) +``` + + +```{r base paths} +# The base path for the OpenScPCA repository, found by its (hidden) .git directory +repository_base <- rprojroot::find_root(rprojroot::is_git_root) + +# The current data directory, found within the repository base directory +data_dir <- file.path(repository_base, "data", "current", "SCPCP000015") + +# The path to this module +module_base <- file.path(repository_base, "analyses", "cell-type-ewings") +``` + +```{r} +# path to sce +sce_file <- file.path(data_dir, params$sample_id, glue::glue("{params$library_id}_processed.rds")) + +# path to workflow results +workflow_results_dir <- file.path(module_base, "results") + +singler_results_dir <- file.path(workflow_results_dir, "aucell_singler_annotation", params$sample_id) +singler_results_file <- file.path(singler_results_dir, + glue::glue("{params$library_id}_singler-classifications.tsv")) + +cluster_results_dir <- file.path(workflow_results_dir, "clustering", params$sample_id) +cluster_results_file <- file.path(cluster_results_dir, + glue::glue("{params$library_id}_cluster-results.tsv")) + +aucell_results_dir <- file.path(workflow_results_dir, "aucell-ews-signatures", params$sample_id) +aucell_results_file <- file.path(aucell_results_dir, + glue::glue("{params$library_id}_auc-ews-gene-signatures.tsv")) + +# small gene sets +visser_marker_genes_file <- file.path(module_base, "references", "visser-all-marker-genes.tsv") +cell_state_genes_file <- file.path(module_base, "references", "tumor-cell-state-markers.tsv") +``` + +```{r} +# output file to save final annotations +results_dir <- file.path(module_base, "results", "final-annotations") +output_file <- file.path(results_dir, glue::glue("{params$library_id}_celltype-annotations.tsv")) +``` + + +```{r} +# source in setup functions prep_results() +setup_functions <- file.path(module_base, "template_notebooks", "utils", "setup-functions.R") +source(setup_functions) + +# source in validation functions calculate_mean_markers() +validation_functions <- file.path(module_base, "scripts", "utils", "tumor-validation-helpers.R") +source(validation_functions) +``` + +```{r} +stopifnot( + "sce file does not exist" = file.exists(sce_file), + "singler results file does not exist" = file.exists(singler_results_file), + "cluster results file does not exist" = file.exists(cluster_results_file), + "aucell results file does not exist" = file.exists(aucell_results_file) +) +``` + + +```{r, message=FALSE} +# read in sce +sce <- readr::read_rds(sce_file) + +# read in workflow results +singler_df <- readr::read_tsv(singler_results_file) +cluster_df <- readr::read_tsv(cluster_results_file) +aucell_df <- readr::read_tsv(aucell_results_file) + +# read in marker genes and combine into one list +visser_markers_df <- readr::read_tsv(visser_marker_genes_file) |> + dplyr::select(cell_type, ensembl_gene_id) |> + unique() + +cell_state_markers_df <- readr::read_tsv(cell_state_genes_file) |> + dplyr::select(cell_type = cell_state, ensembl_gene_id) + +all_markers_df <- dplyr::bind_rows(list(visser_markers_df, cell_state_markers_df)) +``` + +## Prepare data for plotting + +```{r} +all_results_df <- prep_results( + sce, + singler_df, + cluster_df, + aucell_df, + cluster_nn = params$cluster_nn, + cluster_res = params$cluster_res + ) + +# calculate gene set means for each cell type/cell state +cell_types <- unique(all_markers_df$cell_type) + +# get the mean expression of all genes for each cell state +gene_exp_df <- cell_types |> + purrr::map(\(type){ + calculate_mean_markers(all_markers_df, sce, type, cell_type) + }) |> + purrr::reduce(dplyr::inner_join, by = "barcodes") + +all_info_df <- all_results_df |> + dplyr::left_join(gene_exp_df, by = "barcodes") +``` + +## Summary of workflow results + +TODO: Insert plots that will summarize findings from each of the workflows +- UMAPs of SingleR, clusters, AUC values and custom gene set means +- Density plots by cluster of AUC values and custom gene set means +- Maybe heatmaps with cluster annotation of AUC scores and custom gene set means + +## Re-cluster tumor cells **Manual exploration** + + + +TODO: Functions for re-clustering tumor cells +Show the same plots across the tumor only clusters and assign tumor cell states to each cluster + +## Additional exploration **Manual exploration** + + + +## Validate final tumor and normal annotation **Manual exploration** + + + +TODO: Insert plots that will be useful for validation (UMAPs, heatmaps, density plots) + +## Prepare annotations **Manual exploration** + + + +TODO: Code and instructions for exporting such as what columns should be named. + +## Session info + +```{r session info} +# record the versions of the packages used in this analysis and other environment information +sessionInfo() +``` + From 65d4bdd3f282a20afead565ce03de07789307277 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Thu, 23 Jan 2025 16:31:10 -0600 Subject: [PATCH 3/8] add to readme --- .../cell-type-ewings/template_notebooks/README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/analyses/cell-type-ewings/template_notebooks/README.md b/analyses/cell-type-ewings/template_notebooks/README.md index 9b5d63fef..55622a7d6 100644 --- a/analyses/cell-type-ewings/template_notebooks/README.md +++ b/analyses/cell-type-ewings/template_notebooks/README.md @@ -5,3 +5,17 @@ This folder contains any template notebooks that are rendered as part of a workf 1. `cnv-workflow`: This folder contains all template notebooks used in `cnv-annotation.sh`. 2. `auc-singler-workflow`: This folder contains all template notebooks used in `auc-singler-annotation.sh`. 3. `clustering-workflow`: This folder contains all template notebooks used in `evaluate-clusters.sh`. + +## Guide notebook for assigning and evaluating cell type annotations + +The `celltype-exploration.Rmd` notebook is meant to be used as a guide for assigning and evaluating the final cell type annotations for each library in `SCPCP000015`. +Instructions for using this guide: + +1. Ensure that you have a local copy of the results from `aucell-singler-annotation.sh`, `evaluate-clusters.sh` and `run-aucell-ews-signatures.sh` saved to `results`. +2. Copy the contents of this notebook to a new notebook titled `_celltype-exploration.Rmd` and save in `exploratory_analysis/final_annotation_notebooks`. +3. Replace the `sample_id` and `library_id` with the correct IDs in the `params` list. +4. Optionally, you may choose to update the choices for clustering based on the results from `evaluate-clusters.sh`. +All clusters used will be calculated with the Leiden algorithm and the modularity objective function. +To modify the nearest neighbors (default: 20) and resolution (default: 0.5) chosen use the `cluster_nn` and `cluster_res` params. +5. Run through the notebook and update any sections of the notebook marked with `**Manual exploration**`. +6. Render the completed notebook which will produce the rendered `html` file and a TSV with cell type annotations for that library. From a5154a5257668554fd82ef3413074c3068c71338 Mon Sep 17 00:00:00 2001 From: Ally Hawkins <54039191+allyhawkins@users.noreply.github.com> Date: Fri, 24 Jan 2025 09:23:08 -0600 Subject: [PATCH 4/8] Apply suggestions from code review Co-authored-by: Stephanie Spielman --- analyses/cell-type-ewings/template_notebooks/README.md | 4 ++-- .../template_notebooks/celltype-exploration.Rmd | 1 - .../template_notebooks/utils/setup-functions.R | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/analyses/cell-type-ewings/template_notebooks/README.md b/analyses/cell-type-ewings/template_notebooks/README.md index 55622a7d6..1700cf6f9 100644 --- a/analyses/cell-type-ewings/template_notebooks/README.md +++ b/analyses/cell-type-ewings/template_notebooks/README.md @@ -15,7 +15,7 @@ Instructions for using this guide: 2. Copy the contents of this notebook to a new notebook titled `_celltype-exploration.Rmd` and save in `exploratory_analysis/final_annotation_notebooks`. 3. Replace the `sample_id` and `library_id` with the correct IDs in the `params` list. 4. Optionally, you may choose to update the choices for clustering based on the results from `evaluate-clusters.sh`. -All clusters used will be calculated with the Leiden algorithm and the modularity objective function. -To modify the nearest neighbors (default: 20) and resolution (default: 0.5) chosen use the `cluster_nn` and `cluster_res` params. +All clusters used were calculated with the Leiden algorithm and the modularity objective function. +To modify the nearest neighbors (default: 20) and resolution (default: 0.5) parameters use the `cluster_nn` and `cluster_res` params. 5. Run through the notebook and update any sections of the notebook marked with `**Manual exploration**`. 6. Render the completed notebook which will produce the rendered `html` file and a TSV with cell type annotations for that library. diff --git a/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd b/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd index 5395cc10b..2f628f374 100644 --- a/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd +++ b/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd @@ -141,7 +141,6 @@ all_results_df <- prep_results( cluster_res = params$cluster_res ) -# calculate gene set means for each cell type/cell state cell_types <- unique(all_markers_df$cell_type) # get the mean expression of all genes for each cell state diff --git a/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R b/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R index 220b64449..a1c1a349a 100644 --- a/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R +++ b/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R @@ -1,7 +1,6 @@ # These functions are used in `celltype-exploration.Rmd` # They are used for reading in and setting up the cell type results -# read in and prepare combined data frame #' Combine workflow results into a single data frame #' #' @param sce Processed SingleCellExperiment object with UMAP embeddings From ae48004f6c8e2afd394781b9f9f10d41ef8c8c67 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Fri, 24 Jan 2025 09:27:31 -0600 Subject: [PATCH 5/8] remove instructions from readme --- .../template_notebooks/README.md | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/analyses/cell-type-ewings/template_notebooks/README.md b/analyses/cell-type-ewings/template_notebooks/README.md index 1700cf6f9..29b6e8884 100644 --- a/analyses/cell-type-ewings/template_notebooks/README.md +++ b/analyses/cell-type-ewings/template_notebooks/README.md @@ -5,17 +5,5 @@ This folder contains any template notebooks that are rendered as part of a workf 1. `cnv-workflow`: This folder contains all template notebooks used in `cnv-annotation.sh`. 2. `auc-singler-workflow`: This folder contains all template notebooks used in `auc-singler-annotation.sh`. 3. `clustering-workflow`: This folder contains all template notebooks used in `evaluate-clusters.sh`. - -## Guide notebook for assigning and evaluating cell type annotations - -The `celltype-exploration.Rmd` notebook is meant to be used as a guide for assigning and evaluating the final cell type annotations for each library in `SCPCP000015`. -Instructions for using this guide: - -1. Ensure that you have a local copy of the results from `aucell-singler-annotation.sh`, `evaluate-clusters.sh` and `run-aucell-ews-signatures.sh` saved to `results`. -2. Copy the contents of this notebook to a new notebook titled `_celltype-exploration.Rmd` and save in `exploratory_analysis/final_annotation_notebooks`. -3. Replace the `sample_id` and `library_id` with the correct IDs in the `params` list. -4. Optionally, you may choose to update the choices for clustering based on the results from `evaluate-clusters.sh`. -All clusters used were calculated with the Leiden algorithm and the modularity objective function. -To modify the nearest neighbors (default: 20) and resolution (default: 0.5) parameters use the `cluster_nn` and `cluster_res` params. -5. Run through the notebook and update any sections of the notebook marked with `**Manual exploration**`. -6. Render the completed notebook which will produce the rendered `html` file and a TSV with cell type annotations for that library. +4. `celltype-exploration.Rmd`: This notebook is meant to be used as a guide for assigning and evaluating the final cell type annotations for each library in `SCPCP000015`. +Full instructions on how to use this notebook as a guide can be found at the beginning of the notebook itself. From 44f569203a7ef43da039cf4378f93e90b83b3dca Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Fri, 24 Jan 2025 09:41:31 -0600 Subject: [PATCH 6/8] note about using leiden --- .../cell-type-ewings/template_notebooks/utils/setup-functions.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R b/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R index a1c1a349a..ada3f5276 100644 --- a/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R +++ b/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R @@ -2,6 +2,8 @@ # They are used for reading in and setting up the cell type results #' Combine workflow results into a single data frame +#' +#' Note that this function will only include clustering results from Leiden with modularity in the output #' #' @param sce Processed SingleCellExperiment object with UMAP embeddings #' @param singler_df Data frame with results from `aucell-singler-annotation.sh` workflow From c7890330790d4e685620cdda6cd471746ab48eb2 Mon Sep 17 00:00:00 2001 From: Ally Hawkins Date: Fri, 24 Jan 2025 09:41:47 -0600 Subject: [PATCH 7/8] some minor review responses --- .../template_notebooks/celltype-exploration.Rmd | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd b/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd index 2f628f374..5527cb22a 100644 --- a/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd +++ b/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd @@ -21,7 +21,7 @@ Instructions for using this guide: 1. Ensure that you have a local copy of the results from `aucell-singler-annotation.sh`, `evaluate-clusters.sh` and `run-aucell-ews-signatures.sh` saved to `results`. 2. Copy the contents of this notebook to a new notebook titled `_celltype-exploration.Rmd` and save in `exploratory_analysis/final_annotation_notebooks`. -3. Replace the `sample_id` and `library_id` with the correct IDs in the `params` list. +3. Update the `title` in the `yaml` section and replace the `sample_id` and `library_id` with the correct IDs in the `params` list. 4. Optionally, you may choose to update the choices for clustering based on the results from `evaluate-clusters.sh`. All clusters used will be calculated with the Leiden algorithm and the modularity objective function. To modify the nearest neighbors (default: 20) and resolution (default: 0.5) chosen use the `cluster_nn` and `cluster_res` params. @@ -161,7 +161,7 @@ TODO: Insert plots that will summarize findings from each of the workflows - Density plots by cluster of AUC values and custom gene set means - Maybe heatmaps with cluster annotation of AUC scores and custom gene set means -## Re-cluster tumor cells **Manual exploration** +## Re-cluster tumor cells {.manual-exploration} TODO: Functions for re-clustering tumor cells Show the same plots across the tumor only clusters and assign tumor cell states to each cluster -## Additional exploration **Manual exploration** +## Additional exploration {.manual-exploration} -## Validate final tumor and normal annotation **Manual exploration** +## Validate final tumor and normal annotation {.manual-exploration} TODO: Insert plots that will be useful for validation (UMAPs, heatmaps, density plots) -## Prepare annotations **Manual exploration** +## Prepare annotations {.manual-exploration} From 5fec5404b0ba00b7389fd29a60ee0b1d2b4120a6 Mon Sep 17 00:00:00 2001 From: Ally Hawkins <54039191+allyhawkins@users.noreply.github.com> Date: Fri, 24 Jan 2025 09:52:52 -0600 Subject: [PATCH 8/8] Update instructions Co-authored-by: Stephanie Spielman --- .../template_notebooks/celltype-exploration.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd b/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd index 5527cb22a..61ffce69e 100644 --- a/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd +++ b/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd @@ -25,7 +25,7 @@ Instructions for using this guide: 4. Optionally, you may choose to update the choices for clustering based on the results from `evaluate-clusters.sh`. All clusters used will be calculated with the Leiden algorithm and the modularity objective function. To modify the nearest neighbors (default: 20) and resolution (default: 0.5) chosen use the `cluster_nn` and `cluster_res` params. -5. Run through the notebook and update any sections of the notebook marked with `**Manual exploration**`. +5. Run through the notebook and update any sections of the notebook marked with the `{.manual-exploration}` tag. 6. Render the completed notebook which will produce the rendered `html` file and a TSV with cell type annotations for that library. ## Setup