Merge branch 'development' into sjspielman/516-heatmaps-calc-jaccard

AlexsLemonade · Oct 19, 2023 · 0a77c45 · 0a77c45
2 parents bc18369 + 934df29
commit 0a77c45
Show file tree

Hide file tree

Showing 6 changed files with 77 additions and 99 deletions.
diff --git a/bin/add_celltypes_to_sce.R b/bin/add_celltypes_to_sce.R
@@ -56,59 +56,67 @@ sce <- readr::read_rds(opt$input_sce_file)
 # SingleR results --------------------------------------------------------------
 
 if(!is.null(opt$singler_results)){
-  
+
   if(!file.exists(opt$singler_results)){
     stop("Missing singleR results file")
   }
-  
+
   singler_results <- readr::read_rds(opt$singler_results)
-
-  # first just add in labels as annotations
-  sce$singler_celltype_annotation = singler_results$pruned.labels
-
+
+  # create a tibble with annotations and barcode
+  # later we'll add the annotations into colData by joining on barcodes column
+  annotations_df <- tibble::tibble(
+    barcodes = rownames(singler_results),
+    singler_celltype_annotation = singler_results$pruned.labels,
+  )
+
   # map ontology labels to cell type names, as needed
   # we can tell if ontologies were used because this will exist:
   if ("cell_ontology_df" %in% names(singler_results)) {
-    
+
     # end up with columns: barcode, singler_celltype_annotation, singler_celltype_ontology
-    colData(sce) <- colData(sce) |>
-      as.data.frame() |> 
+    colData(sce) <- annotations_df |>
       dplyr::left_join(
         # column names: ontology_id, ontology_cell_names
-        singler_results$cell_ontology_df, 
+        singler_results$cell_ontology_df,
         by = c("singler_celltype_annotation" = "ontology_id")
-      ) |> 
+      ) |>
       # rename columns
       dplyr::rename(
         # ontology should contain the original pruned labels
         singler_celltype_ontology = singler_celltype_annotation,
         # annotation contains the cell names associated with the ontology
         singler_celltype_annotation = ontology_cell_names
-      ) |>
-      DataFrame(row.names = colData(sce)$barcodes)
+      )
+
+  }
 
-  } 
+  # add annotations to colData
+  colData(sce) <- colData(sce) |>
+    as.data.frame() |>
+    dplyr::left_join(annotations_df, by = c("barcodes")) |>
+    DataFrame(row.names = colData(sce)$barcodes)
 
- # add singler info to metadata
+  # add singler info to metadata
   metadata(sce)$singler_results <- singler_results
-  metadata(sce)$reference_name <- metadata(singler_results)$reference_name
-  
-  # add note about cell type method to metadata 
+  metadata(sce)$singler_reference <- metadata(singler_results)$reference_name
+
+  # add note about cell type method to metadata
   metadata(sce)$celltype_methods <- c(metadata(sce)$celltype_methods, "singler")
-  
+
 }
 
 # CellAssign results -----------------------------------------------------------
 
 if(!is.null(opt$cellassign_predictions)){
- 
+
   # check that cellassign predictions file was provided
   if (!file.exists(opt$cellassign_predictions)) {
     stop("Missing CellAssign predictions file")
   }
-  
+
   predictions <- readr::read_tsv(opt$cellassign_predictions)
-  
+
   # get cell type with maximum prediction value for each cell
   celltype_assignments <- predictions |>
     tidyr::pivot_longer(
@@ -119,23 +127,23 @@ if(!is.null(opt$cellassign_predictions)){
     dplyr::group_by(barcode) |>
     dplyr::slice_max(prediction, n = 1) |>
     dplyr::ungroup()
-  
+
   # join by barcode to make sure assignments are in the right order
   celltype_assignments <- data.frame(barcode = sce$barcodes) |>
     dplyr::left_join(celltype_assignments, by = "barcode")
-  
+
   # add cell type and prediction to colData
   sce$cellassign_celltype_annotation <- celltype_assignments$celltype
   sce$cellassign_max_prediction <- celltype_assignments$prediction
-  
+
   # add entire predictions matrix and ref name to metadata
   metadata(sce)$cellassign_predictions <- predictions
   metadata(sce)$cellassign_reference <- opt$cellassign_ref_name
-  
+
   # add cellassign as celltype method
   # note that if `metadata(sce)$celltype_methods` doesn't exist yet, this will
   #  come out to just the string "cellassign"
-  metadata(sce)$celltype_methods <- c(metadata(sce)$celltype_methods, "cellassign") 
+  metadata(sce)$celltype_methods <- c(metadata(sce)$celltype_methods, "cellassign")
 }
 
 # export annotated object with cellassign assignments

diff --git a/bin/classify_SingleR.R b/bin/classify_SingleR.R
@@ -18,14 +18,9 @@ option_list <- list(
   make_option(
     opt_str = c("--singler_model_file"),
     type = "character",
-    help = "path to file containing a single model generated for SingleR annotation. 
+    help = "path to file containing a single model generated for SingleR annotation.
             File name is expected to be in form: <model name>_model.rds."
   ),
-  make_option(
-    opt_str = c("--output_singler_annotations_file"),
-    type = "character",
-    help = "path to output TSV file that will store the SingleR annotations. Must end in .tsv"
-  ),
   make_option(
     opt_str = c("--output_singler_results_file"),
     type = "character",
@@ -49,7 +44,7 @@ opt <- parse_args(OptionParser(option_list = option_list))
 # Set up -----------------------------------------------------------------------
 
 # set seed
-set.seed(opt$random_seed)
+set.seed(opt$seed)
 
 # check that input file file exists
 if (!file.exists(opt$sce_file)) {
@@ -60,9 +55,6 @@ if (!file.exists(opt$sce_file)) {
 if (!(stringr::str_ends(opt$output_singler_results_file, ".rds"))) {
   stop("output SingleR result file name must end in .rds")
 }
-if (!(stringr::str_ends(opt$output_singler_annotations_file, ".tsv"))) {
-  stop("output SingleR annotations file name must end in .tsv")
-}
 
 # check that reference exists and filename is properly formatted
 singler_model_file <- opt$singler_model_file
@@ -103,41 +95,8 @@ singler_results <- SingleR::classifySingleR(
 # add reference name to singler_results DataFrame metadata
 metadata(singler_results)$reference_name <- reference_name
 
-
-# create data frame of annotations
-annotations_df <- tibble::tibble(
-  barcode = rownames(singler_results),
-  singler_celltype_annotation = singler_results$pruned.labels,
-)
-
-# map ontology labels to cell type names, as needed
-# we can tell if ontologies were used because this will exist:
-if ("cell_ontology_df" %in% names(singler_model)) {
-
-  # end up with columns: barcode, singler_celltype_annotation, singler_celltype_ontology
-  annotations_df <- annotations_df |>
-    dplyr::left_join(
-      # column names: ontology_id, ontology_cell_names
-      singler_model$cell_ontology_df, 
-      by = c("singler_celltype_annotation" = "ontology_id")
-    ) |> 
-    # rename columns
-    dplyr::rename(
-      singler_celltype_ontology = singler_celltype_annotation,
-      singler_celltype_annotation = ontology_cell_names
-    )
-
-  # add cell_ontology_df to singler_results DataFrame metadata
-  metadata(singler_results)$cell_ontology_df <- singler_model$cell_ontology_df
-} 
-
-
-
 # export results ---------------
 
-# first, a stand-alone tsv of annotations
-readr::write_tsv(annotations_df, opt$output_singler_annotations_file)
-
 # next, the full result to a compressed rds
 readr::write_rds(
   singler_results,

diff --git a/config/containers.config b/config/containers.config
@@ -1,5 +1,5 @@
 // Docker container images
-SCPCATOOLS_CONTAINER = 'ghcr.io/alexslemonade/scpca-tools:v0.3.0'
+SCPCATOOLS_CONTAINER = 'ghcr.io/alexslemonade/scpca-tools:edge'
 
 ALEVINFRY_CONTAINER = 'quay.io/biocontainers/alevin-fry:0.7.0--h9f5acd7_1'
 BCFTOOLS_CONTAINER = 'quay.io/biocontainers/bcftools:1.14--h88f3f91_0'

diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf
@@ -22,7 +22,6 @@ process classify_singler {
       classify_SingleR.R \
         --sce_file "${processed_rds}" \
         --singler_model_file "${singler_model_file}" \
-        --output_singler_annotations_file "${singler_dir}/singler_annotations.tsv" \
         --output_singler_results_file "${singler_dir}/singler_results.rds" \
         --seed ${params.seed} \
         --threads ${task.cpus}
@@ -98,7 +97,7 @@ process add_celltypes_to_sce {
     annotated_rds = "${meta.library_id}_processed_annotated.rds"
     singler_present = "${singler_dir.name}" != "NO_FILE"
     singler_results = "${singler_dir}/singler_results.rds"
-    cellassign_present = "${cellassign_dir}.name" != "NO_FILE"
+    cellassign_present = "${cellassign_dir.name}" != "NO_FILE"
     cellassign_predictions = "${cellassign_dir}/cellassign_predictions.tsv"
     cellassign_ref_name = file("${meta.cellassign_reference_file}").baseName
     """
@@ -149,6 +148,8 @@ workflow annotate_celltypes {
           meta.cellassign_dir = "${meta.celltype_publish_dir}/${meta.library_id}_cellassign";
           meta.singler_model_file = singler_model_file;
           meta.cellassign_reference_file = cellassign_reference_file;
+          meta.singler_results_file = "${meta.singler_dir}/singler_results.rds";
+          meta.cellassign_predictions_file = "${meta.cellassign_dir}/cellassign_predictions.tsv"
           // return simplified input:
           [meta, processed_sce]
         }
@@ -158,18 +159,23 @@ workflow annotate_celltypes {
       singler_input_ch = celltype_input_ch
         // add in singler model or empty file
         .map{it.toList() + [file(it[0].singler_model_file ?: empty_file)]}
-        // skip if no singleR model file
+        // skip if no singleR model file or if singleR results are already present
         .branch{
+          skip_singler: !params.repeat_celltyping && file(it[0].singler_results_file).exists()
           missing_ref: it[2].name == "NO_FILE"
           do_singler: true
         }
 
 
       // perform singleR celltyping and export results
       classify_singler(singler_input_ch.do_singler)
+
       // singleR output channel: [library_id, singler_results]
-      singler_output_ch = singler_input_ch.missing_ref
-        .map{[it[0]["library_id"], file(empty_file)]}
+      singler_output_ch = singler_input_ch.skip_singler
+        // provide existing singler results dir for those we skipped
+        .map{[it[0]["library_id"], file(it[0].singler_dir, type: 'dir')]}
+        // add empty file for missing ref samples
+        .mix(singler_input_ch.missing_ref.map{[it[0]["library_id"], file(empty_file)]} )
         // add in channel outputs
         .mix(classify_singler.out)
 
@@ -179,6 +185,7 @@ workflow annotate_celltypes {
         .map{it.toList() + [file(it[0].cellassign_reference_file ?: empty_file)]}
         // skip if no cellassign reference file or reference name is not defined
         .branch{
+          skip_cellassign: !params.repeat_celltyping && file(it[0].cellassign_predictions_file).exists()
           missing_ref: it[2].name == "NO_FILE"
           do_cellassign: true
         }
@@ -188,35 +195,38 @@ workflow annotate_celltypes {
       classify_cellassign(cellassign_input_ch.do_cellassign)
 
       // cellassign output channel: [library_id, cellassign_dir]
-      cellassign_output_ch = cellassign_input_ch.missing_ref
-        .map{[it[0]["library_id"], file(empty_file)]}
+      cellassign_output_ch = cellassign_input_ch.skip_cellassign
+        // provide existing cellassign predictions dir for those we skipped
+        .map{[it[0]["library_id"], file(it[0].cellassign_dir, type: 'dir')]}
+        // add empty file for missing ref samples
+        .mix(cellassign_input_ch.missing_ref.map{[it[0]["library_id"], file(empty_file)]} )
         // add in channel outputs
         .mix(classify_cellassign.out)
 
       // prepare input for process to add celltypes to the processed SCE
       // result is [meta, processed rds, singler dir, cellassign dir]
-      assignment_input_ch = processed_sce_channel
+      assignment_input_ch = celltype_input_ch
         .map{[it[0]["library_id"]] + it}
         // add in singler results
         .join(singler_output_ch, by: 0, failOnMismatch: true, failOnDuplicate: true)
         // add in cell assign results
         .join(cellassign_output_ch, by: 0, failOnMismatch: true, failOnDuplicate: true)
         .map{it.drop(1)} // remove library_id
         .branch{
-          // pull out libraries that actually have at least 1 type of annotations
+          // pull out libraries that actually have at least 1 type of annotation
           add_celltypes: (it[2].baseName != "NO_FILE") || (it[3].baseName != "NO_FILE")
           no_celltypes: true
         }
 
 
       // incorporate annotations into SCE object
       add_celltypes_to_sce(assignment_input_ch.add_celltypes)
-      
+
       // mix in libraries without new celltypes
       // result is [meta, proccessed rds]
       celltyped_ch = assignment_input_ch.no_celltypes
         .map{[it[0], it[1]]}
-        .mix(add_celltypes_to_sce.out) 
+        .mix(add_celltypes_to_sce.out)
 
       // add back in the unchanged sce files to the results
       export_channel = celltyped_ch

diff --git a/nextflow.config b/nextflow.config
@@ -39,6 +39,7 @@ params {
   publish_fry_outs = false // alevin-fry outputs are not published by default. Use `--publish_fry_outs` to publish these files to the `checkpoints` folder.
   spliced_only = false // include only spliced reads in counts matrix, by default both unspliced and spliced reads are totaled and found in `counts` asasy of returned SingleCellExperiment object
   perform_celltyping = false // specify whether or not to incorporate cell type annotations
+  repeat_celltyping = false // if cell type annotations already exist, skip cell type classification with SingleR and CellAssign
 
   seed = 2021   // random number seed for filtering and post-processing (0 means use system seed)
 

diff --git a/templates/qc_report/celltypes_supplemental_report.rmd b/templates/qc_report/celltypes_supplemental_report.rmd
@@ -329,22 +329,22 @@ In this section, we assess the reliability of cell type annotations using diagno
 knitr::asis_output("
 ### `SingleR` assessment
 
-Here, we evaluate the quality of `SingleR` cell type annotations by comparing the scores for the assigned cell type to the median score of all other cell type labels.
+To assess the quality of the `SingleR`-assigned cell types, we use the _delta median_ statistic.
 
-- This quantity is the _delta median_ statistic. 
-- _Delta median_ is calculated for each cell by subtracting the median score of all other cell type labels from the score of the assigned cell type label.
+- _Delta median_ is calculated for each cell as the difference between the `SingleR` score of the assigned cell type label and the median score of the other cell type labels in the reference dataset.
 - Higher _delta median_ values indicate higher quality cell type annotations.
-  - However, there is no universal threshold for calling absolute high vs. low quality, as described in the [`SingleR` book section on 'Annotation diagnostics'](https://bioconductor.org/books/release/SingleRBook/annotation-diagnostics.html#annotation-diagnostics).
+  - Values can range from 0-1.
+  - Note that there is no universal threshold for calling absolute high vs. low quality, as described in the [`SingleR` book section on 'Annotation diagnostics'](https://bioconductor.org/books/release/SingleRBook/annotation-diagnostics.html#annotation-diagnostics).
 
 You can interpret this plot as follows:
 
-- Each point represents the _delta median_ statistic of a given cell whose final `SingleR` annotation is shown on the y-axis.
-- The color of the points indicates how confident `SingleR` is in the cell type annotation: 
+- Each point represents the _delta median_ statistic of a given cell whose assigned `SingleR` annotation is shown on the y-axis.
+- The point color indicates `SingleR`'s quality assessment of the annotation:
   - High-quality cell annotations are shown as closed points.
-  - Low-quality cell annotations are shown as open points. 
+  - Low-quality cell annotations are shown as open points.
   In other sections of this report, these cells are referred to as `Unknown cell types`.
   - For more information on how `SingleR` calculates annotation quality, please refer to [this `SingleR` documentation](https://rdrr.io/bioc/SingleR/man/pruneScores.html).
-- Red diamonds represent the median _delta median_ statistic among high-quality points for the given annotation.
+- Red diamonds represent the median _delta median_ statistic among high-quality annotations for the given cell type label.
 ")
 ```
 
@@ -364,7 +364,7 @@ delta_median_df <- tibble::tibble(
   # so, negate for this variable:
   confident = !is.na(metadata(processed_sce)$singler_result$pruned.labels)
 ) |>
-  dplyr::mutate(confident = 
+  dplyr::mutate(confident =
     ifelse(confident, "High-quality", "Low-quality")
   )
 
@@ -417,7 +417,7 @@ ggplot(delta_median_df) +
   aes(
     x = delta_median,
     y = annotation_wrapped,
-    shape = confident, 
+    shape = confident,
     alpha = confident
   ) +
   ggforce::geom_sina(
@@ -426,7 +426,7 @@ ggplot(delta_median_df) +
     fill = "white", # will apply to non-confident fill only
     position = position_dodge(width = 0.05) # Keep both types of points mostly in line
   ) +
-  # Handle points aesthetics: 
+  # Handle points aesthetics:
   #  confident are closed black with alpha = 0.5
   #  not confident are open black with alpha = 1
   scale_shape_manual(values = c(19, 21)) +
@@ -438,16 +438,16 @@ ggplot(delta_median_df) +
   ) +
   # add median diamond for confident points only
   stat_summary(
-    data = delta_median_confident_df, 
+    data = delta_median_confident_df,
     color = "red",
-    geom = "point", 
-    fun = "median", 
-    shape = 18, 
-    size = 2.25, 
+    geom = "point",
+    fun = "median",
+    shape = 18,
+    size = 2.25,
     alpha = 0.9
   ) +
   guides(
-    alpha = FALSE, 
+    alpha = FALSE,
     shape = guide_legend(override.aes = list(size = 1.5, alpha = 0.55))
   ) +
   theme(