diff --git a/analyses/cell-type-ewings/template_notebooks/README.md b/analyses/cell-type-ewings/template_notebooks/README.md index a63dfe8a8..ed855ca96 100644 --- a/analyses/cell-type-ewings/template_notebooks/README.md +++ b/analyses/cell-type-ewings/template_notebooks/README.md @@ -6,3 +6,5 @@ This folder contains any template notebooks that are rendered as part of a workf **NOTE:** This workflow is no longer used in the cell type annotation analysis, has been removed from CI, and is no longer maintained! 2. `auc-singler-workflow`: This folder contains all template notebooks used in `auc-singler-annotation.sh`. 3. `clustering-workflow`: This folder contains all template notebooks used in `evaluate-clusters.sh`. +4. `celltype-exploration.Rmd`: This notebook is meant to be used as a guide for assigning and evaluating the final cell type annotations for each library in `SCPCP000015`. +Full instructions on how to use this notebook as a guide can be found at the beginning of the notebook itself. diff --git a/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd b/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd new file mode 100644 index 000000000..61ffce69e --- /dev/null +++ b/analyses/cell-type-ewings/template_notebooks/celltype-exploration.Rmd @@ -0,0 +1,198 @@ +--- +title: "Template notebook for validating cell type assignments for an individual library in SCPCP000015" +author: Ally Hawkins +date: "`r Sys.Date()`" +output: + html_document: + toc: true + toc_depth: 3 + code_folding: "hide" +params: + sample_id: "SCPCS000490" + library_id: "SCPCL000822" + cluster_nn: 20 + cluster_res: 0.5 +--- + +This notebook is meant to be a guide for compiling "final" cell type annotations for an individual library in `SCPCP000015`. +Results from `aucell-singler-annotation.sh`, `evaulate-clusters.sh`, and `run-aucell-ews-signatures.sh` are all combined and used to assign cell type annotations on a case by case basis. + +Instructions for using this guide: + +1. Ensure that you have a local copy of the results from `aucell-singler-annotation.sh`, `evaluate-clusters.sh` and `run-aucell-ews-signatures.sh` saved to `results`. +2. Copy the contents of this notebook to a new notebook titled `_celltype-exploration.Rmd` and save in `exploratory_analysis/final_annotation_notebooks`. +3. Update the `title` in the `yaml` section and replace the `sample_id` and `library_id` with the correct IDs in the `params` list. +4. Optionally, you may choose to update the choices for clustering based on the results from `evaluate-clusters.sh`. +All clusters used will be calculated with the Leiden algorithm and the modularity objective function. +To modify the nearest neighbors (default: 20) and resolution (default: 0.5) chosen use the `cluster_nn` and `cluster_res` params. +5. Run through the notebook and update any sections of the notebook marked with the `{.manual-exploration}` tag. +6. Render the completed notebook which will produce the rendered `html` file and a TSV with cell type annotations for that library. + +## Setup + +```{r packages} +suppressPackageStartupMessages({ + # load required packages + library(SingleCellExperiment) + library(ggplot2) +}) + +# Set default ggplot theme +theme_set( + theme_classic() +) + +# set seed +set.seed(2024) +``` + + +```{r base paths} +# The base path for the OpenScPCA repository, found by its (hidden) .git directory +repository_base <- rprojroot::find_root(rprojroot::is_git_root) + +# The current data directory, found within the repository base directory +data_dir <- file.path(repository_base, "data", "current", "SCPCP000015") + +# The path to this module +module_base <- file.path(repository_base, "analyses", "cell-type-ewings") +``` + +```{r} +# path to sce +sce_file <- file.path(data_dir, params$sample_id, glue::glue("{params$library_id}_processed.rds")) + +# path to workflow results +workflow_results_dir <- file.path(module_base, "results") + +singler_results_dir <- file.path(workflow_results_dir, "aucell_singler_annotation", params$sample_id) +singler_results_file <- file.path(singler_results_dir, + glue::glue("{params$library_id}_singler-classifications.tsv")) + +cluster_results_dir <- file.path(workflow_results_dir, "clustering", params$sample_id) +cluster_results_file <- file.path(cluster_results_dir, + glue::glue("{params$library_id}_cluster-results.tsv")) + +aucell_results_dir <- file.path(workflow_results_dir, "aucell-ews-signatures", params$sample_id) +aucell_results_file <- file.path(aucell_results_dir, + glue::glue("{params$library_id}_auc-ews-gene-signatures.tsv")) + +# small gene sets +visser_marker_genes_file <- file.path(module_base, "references", "visser-all-marker-genes.tsv") +cell_state_genes_file <- file.path(module_base, "references", "tumor-cell-state-markers.tsv") +``` + +```{r} +# output file to save final annotations +results_dir <- file.path(module_base, "results", "final-annotations") +output_file <- file.path(results_dir, glue::glue("{params$library_id}_celltype-annotations.tsv")) +``` + + +```{r} +# source in setup functions prep_results() +setup_functions <- file.path(module_base, "template_notebooks", "utils", "setup-functions.R") +source(setup_functions) + +# source in validation functions calculate_mean_markers() +validation_functions <- file.path(module_base, "scripts", "utils", "tumor-validation-helpers.R") +source(validation_functions) +``` + +```{r} +stopifnot( + "sce file does not exist" = file.exists(sce_file), + "singler results file does not exist" = file.exists(singler_results_file), + "cluster results file does not exist" = file.exists(cluster_results_file), + "aucell results file does not exist" = file.exists(aucell_results_file) +) +``` + + +```{r, message=FALSE} +# read in sce +sce <- readr::read_rds(sce_file) + +# read in workflow results +singler_df <- readr::read_tsv(singler_results_file) +cluster_df <- readr::read_tsv(cluster_results_file) +aucell_df <- readr::read_tsv(aucell_results_file) + +# read in marker genes and combine into one list +visser_markers_df <- readr::read_tsv(visser_marker_genes_file) |> + dplyr::select(cell_type, ensembl_gene_id) |> + unique() + +cell_state_markers_df <- readr::read_tsv(cell_state_genes_file) |> + dplyr::select(cell_type = cell_state, ensembl_gene_id) + +all_markers_df <- dplyr::bind_rows(list(visser_markers_df, cell_state_markers_df)) +``` + +## Prepare data for plotting + +```{r} +all_results_df <- prep_results( + sce, + singler_df, + cluster_df, + aucell_df, + cluster_nn = params$cluster_nn, + cluster_res = params$cluster_res + ) + +cell_types <- unique(all_markers_df$cell_type) + +# get the mean expression of all genes for each cell state +gene_exp_df <- cell_types |> + purrr::map(\(type){ + calculate_mean_markers(all_markers_df, sce, type, cell_type) + }) |> + purrr::reduce(dplyr::inner_join, by = "barcodes") + +all_info_df <- all_results_df |> + dplyr::left_join(gene_exp_df, by = "barcodes") +``` + +## Summary of workflow results + +TODO: Insert plots that will summarize findings from each of the workflows +- UMAPs of SingleR, clusters, AUC values and custom gene set means +- Density plots by cluster of AUC values and custom gene set means +- Maybe heatmaps with cluster annotation of AUC scores and custom gene set means + +## Re-cluster tumor cells {.manual-exploration} + + + +TODO: Functions for re-clustering tumor cells +Show the same plots across the tumor only clusters and assign tumor cell states to each cluster + +## Additional exploration {.manual-exploration} + + + +## Validate final tumor and normal annotation {.manual-exploration} + + + +TODO: Insert plots that will be useful for validation (UMAPs, heatmaps, density plots) + +## Prepare annotations {.manual-exploration} + + + +TODO: Code and instructions for exporting such as what columns should be named. + +## Session info + +```{r session info} +# record the versions of the packages used in this analysis and other environment information +sessionInfo() +``` + diff --git a/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R b/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R new file mode 100644 index 000000000..ada3f5276 --- /dev/null +++ b/analyses/cell-type-ewings/template_notebooks/utils/setup-functions.R @@ -0,0 +1,80 @@ +# These functions are used in `celltype-exploration.Rmd` +# They are used for reading in and setting up the cell type results + +#' Combine workflow results into a single data frame +#' +#' Note that this function will only include clustering results from Leiden with modularity in the output +#' +#' @param sce Processed SingleCellExperiment object with UMAP embeddings +#' @param singler_df Data frame with results from `aucell-singler-annotation.sh` workflow +#' @param cluster_df Data frame with results from `evaluate-clusters.sh` workflow +#' @param aucell_df Data frame with results from `run-aucell-ews-signatures.sh` workflow +#' @param cluster_nn Value of nearest neighbors to use for cluster results. Default is 20. +#' @param cluster_res Value of resolution to use for cluster results. Default is 20. +#' +prep_results <- function( + sce, + singler_df, + cluster_df, + aucell_df, + cluster_nn = 20, + cluster_res = 0.5 +) { + + ## grab UMAP + umap_df <- sce |> + scuttle::makePerCellDF(use.dimred = "UMAP") |> + # replace UMAP.1 with UMAP1 and get rid of excess columns + dplyr::select(barcodes, UMAP1 = UMAP.1, UMAP2 = UMAP.2) + + ## prep singler data + singler_df <- singler_df |> + dplyr::mutate( + # first grab anything that is tumor and label it tumor + # NA should be unknown + singler_annotation = dplyr::case_when( + stringr::str_detect(singler_annotation, "tumor") ~ "tumor", + is.na(singler_annotation) ~ "unknown", # make sure to separate out unknown labels + .default = singler_annotation + ) |> + forcats::fct_relevel("tumor", after = 0), + # get the top cell types for plotting later + singler_lumped = singler_annotation |> + forcats::fct_lump_n(7, other_level = "All remaining cell types", ties.method = "first") |> + forcats::fct_infreq() |> + forcats::fct_relevel("All remaining cell types", after = Inf) + ) + + ## prep cluster data + cluster_df <- cluster_df |> + # filter to the clustering results we want to use + dplyr::filter( + cluster_method == "leiden_mod", + nn == cluster_nn, + resolution == cluster_res + ) |> + dplyr::select( + barcodes = cell_id, + cluster + ) + + ## prep aucell + aucell_wide_df <- aucell_df |> + dplyr::mutate( + assignment = auc > auc_threshold + ) |> + tidyr::pivot_wider( + id_cols = "barcodes", + names_from = "gene_set", + values_from = c(auc, assignment) + ) + + ## combine into one data frame + all_results_df <- umap_df |> + dplyr::left_join(singler_df, by = c("barcodes")) |> + dplyr::left_join(cluster_df, by = c("barcodes")) |> + dplyr::left_join(aucell_wide_df, by = c("barcodes")) + + return(all_results_df) + +}