Skip to content

Commit

Permalink
Merge branch 'development' into sjspielman/516-heatmaps-calc-jaccard
Browse files Browse the repository at this point in the history
  • Loading branch information
sjspielman authored Oct 19, 2023
2 parents bc18369 + 934df29 commit 0a77c45
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 99 deletions.
62 changes: 35 additions & 27 deletions bin/add_celltypes_to_sce.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,59 +56,67 @@ sce <- readr::read_rds(opt$input_sce_file)
# SingleR results --------------------------------------------------------------

if(!is.null(opt$singler_results)){

if(!file.exists(opt$singler_results)){
stop("Missing singleR results file")
}

singler_results <- readr::read_rds(opt$singler_results)

# first just add in labels as annotations
sce$singler_celltype_annotation = singler_results$pruned.labels


# create a tibble with annotations and barcode
# later we'll add the annotations into colData by joining on barcodes column
annotations_df <- tibble::tibble(
barcodes = rownames(singler_results),
singler_celltype_annotation = singler_results$pruned.labels,
)

# map ontology labels to cell type names, as needed
# we can tell if ontologies were used because this will exist:
if ("cell_ontology_df" %in% names(singler_results)) {

# end up with columns: barcode, singler_celltype_annotation, singler_celltype_ontology
colData(sce) <- colData(sce) |>
as.data.frame() |>
colData(sce) <- annotations_df |>
dplyr::left_join(
# column names: ontology_id, ontology_cell_names
singler_results$cell_ontology_df,
singler_results$cell_ontology_df,
by = c("singler_celltype_annotation" = "ontology_id")
) |>
) |>
# rename columns
dplyr::rename(
# ontology should contain the original pruned labels
singler_celltype_ontology = singler_celltype_annotation,
# annotation contains the cell names associated with the ontology
singler_celltype_annotation = ontology_cell_names
) |>
DataFrame(row.names = colData(sce)$barcodes)
)

}

}
# add annotations to colData
colData(sce) <- colData(sce) |>
as.data.frame() |>
dplyr::left_join(annotations_df, by = c("barcodes")) |>
DataFrame(row.names = colData(sce)$barcodes)

# add singler info to metadata
# add singler info to metadata
metadata(sce)$singler_results <- singler_results
metadata(sce)$reference_name <- metadata(singler_results)$reference_name
# add note about cell type method to metadata
metadata(sce)$singler_reference <- metadata(singler_results)$reference_name

# add note about cell type method to metadata
metadata(sce)$celltype_methods <- c(metadata(sce)$celltype_methods, "singler")

}

# CellAssign results -----------------------------------------------------------

if(!is.null(opt$cellassign_predictions)){

# check that cellassign predictions file was provided
if (!file.exists(opt$cellassign_predictions)) {
stop("Missing CellAssign predictions file")
}

predictions <- readr::read_tsv(opt$cellassign_predictions)

# get cell type with maximum prediction value for each cell
celltype_assignments <- predictions |>
tidyr::pivot_longer(
Expand All @@ -119,23 +127,23 @@ if(!is.null(opt$cellassign_predictions)){
dplyr::group_by(barcode) |>
dplyr::slice_max(prediction, n = 1) |>
dplyr::ungroup()

# join by barcode to make sure assignments are in the right order
celltype_assignments <- data.frame(barcode = sce$barcodes) |>
dplyr::left_join(celltype_assignments, by = "barcode")

# add cell type and prediction to colData
sce$cellassign_celltype_annotation <- celltype_assignments$celltype
sce$cellassign_max_prediction <- celltype_assignments$prediction

# add entire predictions matrix and ref name to metadata
metadata(sce)$cellassign_predictions <- predictions
metadata(sce)$cellassign_reference <- opt$cellassign_ref_name

# add cellassign as celltype method
# note that if `metadata(sce)$celltype_methods` doesn't exist yet, this will
# come out to just the string "cellassign"
metadata(sce)$celltype_methods <- c(metadata(sce)$celltype_methods, "cellassign")
metadata(sce)$celltype_methods <- c(metadata(sce)$celltype_methods, "cellassign")
}

# export annotated object with cellassign assignments
Expand Down
45 changes: 2 additions & 43 deletions bin/classify_SingleR.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,9 @@ option_list <- list(
make_option(
opt_str = c("--singler_model_file"),
type = "character",
help = "path to file containing a single model generated for SingleR annotation.
help = "path to file containing a single model generated for SingleR annotation.
File name is expected to be in form: <model name>_model.rds."
),
make_option(
opt_str = c("--output_singler_annotations_file"),
type = "character",
help = "path to output TSV file that will store the SingleR annotations. Must end in .tsv"
),
make_option(
opt_str = c("--output_singler_results_file"),
type = "character",
Expand All @@ -49,7 +44,7 @@ opt <- parse_args(OptionParser(option_list = option_list))
# Set up -----------------------------------------------------------------------

# set seed
set.seed(opt$random_seed)
set.seed(opt$seed)

# check that input file file exists
if (!file.exists(opt$sce_file)) {
Expand All @@ -60,9 +55,6 @@ if (!file.exists(opt$sce_file)) {
if (!(stringr::str_ends(opt$output_singler_results_file, ".rds"))) {
stop("output SingleR result file name must end in .rds")
}
if (!(stringr::str_ends(opt$output_singler_annotations_file, ".tsv"))) {
stop("output SingleR annotations file name must end in .tsv")
}

# check that reference exists and filename is properly formatted
singler_model_file <- opt$singler_model_file
Expand Down Expand Up @@ -103,41 +95,8 @@ singler_results <- SingleR::classifySingleR(
# add reference name to singler_results DataFrame metadata
metadata(singler_results)$reference_name <- reference_name


# create data frame of annotations
annotations_df <- tibble::tibble(
barcode = rownames(singler_results),
singler_celltype_annotation = singler_results$pruned.labels,
)

# map ontology labels to cell type names, as needed
# we can tell if ontologies were used because this will exist:
if ("cell_ontology_df" %in% names(singler_model)) {

# end up with columns: barcode, singler_celltype_annotation, singler_celltype_ontology
annotations_df <- annotations_df |>
dplyr::left_join(
# column names: ontology_id, ontology_cell_names
singler_model$cell_ontology_df,
by = c("singler_celltype_annotation" = "ontology_id")
) |>
# rename columns
dplyr::rename(
singler_celltype_ontology = singler_celltype_annotation,
singler_celltype_annotation = ontology_cell_names
)

# add cell_ontology_df to singler_results DataFrame metadata
metadata(singler_results)$cell_ontology_df <- singler_model$cell_ontology_df
}



# export results ---------------

# first, a stand-alone tsv of annotations
readr::write_tsv(annotations_df, opt$output_singler_annotations_file)

# next, the full result to a compressed rds
readr::write_rds(
singler_results,
Expand Down
2 changes: 1 addition & 1 deletion config/containers.config
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// Docker container images
SCPCATOOLS_CONTAINER = 'ghcr.io/alexslemonade/scpca-tools:v0.3.0'
SCPCATOOLS_CONTAINER = 'ghcr.io/alexslemonade/scpca-tools:edge'

ALEVINFRY_CONTAINER = 'quay.io/biocontainers/alevin-fry:0.7.0--h9f5acd7_1'
BCFTOOLS_CONTAINER = 'quay.io/biocontainers/bcftools:1.14--h88f3f91_0'
Expand Down
32 changes: 21 additions & 11 deletions modules/classify-celltypes.nf
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ process classify_singler {
classify_SingleR.R \
--sce_file "${processed_rds}" \
--singler_model_file "${singler_model_file}" \
--output_singler_annotations_file "${singler_dir}/singler_annotations.tsv" \
--output_singler_results_file "${singler_dir}/singler_results.rds" \
--seed ${params.seed} \
--threads ${task.cpus}
Expand Down Expand Up @@ -98,7 +97,7 @@ process add_celltypes_to_sce {
annotated_rds = "${meta.library_id}_processed_annotated.rds"
singler_present = "${singler_dir.name}" != "NO_FILE"
singler_results = "${singler_dir}/singler_results.rds"
cellassign_present = "${cellassign_dir}.name" != "NO_FILE"
cellassign_present = "${cellassign_dir.name}" != "NO_FILE"
cellassign_predictions = "${cellassign_dir}/cellassign_predictions.tsv"
cellassign_ref_name = file("${meta.cellassign_reference_file}").baseName
"""
Expand Down Expand Up @@ -149,6 +148,8 @@ workflow annotate_celltypes {
meta.cellassign_dir = "${meta.celltype_publish_dir}/${meta.library_id}_cellassign";
meta.singler_model_file = singler_model_file;
meta.cellassign_reference_file = cellassign_reference_file;
meta.singler_results_file = "${meta.singler_dir}/singler_results.rds";
meta.cellassign_predictions_file = "${meta.cellassign_dir}/cellassign_predictions.tsv"
// return simplified input:
[meta, processed_sce]
}
Expand All @@ -158,18 +159,23 @@ workflow annotate_celltypes {
singler_input_ch = celltype_input_ch
// add in singler model or empty file
.map{it.toList() + [file(it[0].singler_model_file ?: empty_file)]}
// skip if no singleR model file
// skip if no singleR model file or if singleR results are already present
.branch{
skip_singler: !params.repeat_celltyping && file(it[0].singler_results_file).exists()
missing_ref: it[2].name == "NO_FILE"
do_singler: true
}


// perform singleR celltyping and export results
classify_singler(singler_input_ch.do_singler)

// singleR output channel: [library_id, singler_results]
singler_output_ch = singler_input_ch.missing_ref
.map{[it[0]["library_id"], file(empty_file)]}
singler_output_ch = singler_input_ch.skip_singler
// provide existing singler results dir for those we skipped
.map{[it[0]["library_id"], file(it[0].singler_dir, type: 'dir')]}
// add empty file for missing ref samples
.mix(singler_input_ch.missing_ref.map{[it[0]["library_id"], file(empty_file)]} )
// add in channel outputs
.mix(classify_singler.out)

Expand All @@ -179,6 +185,7 @@ workflow annotate_celltypes {
.map{it.toList() + [file(it[0].cellassign_reference_file ?: empty_file)]}
// skip if no cellassign reference file or reference name is not defined
.branch{
skip_cellassign: !params.repeat_celltyping && file(it[0].cellassign_predictions_file).exists()
missing_ref: it[2].name == "NO_FILE"
do_cellassign: true
}
Expand All @@ -188,35 +195,38 @@ workflow annotate_celltypes {
classify_cellassign(cellassign_input_ch.do_cellassign)

// cellassign output channel: [library_id, cellassign_dir]
cellassign_output_ch = cellassign_input_ch.missing_ref
.map{[it[0]["library_id"], file(empty_file)]}
cellassign_output_ch = cellassign_input_ch.skip_cellassign
// provide existing cellassign predictions dir for those we skipped
.map{[it[0]["library_id"], file(it[0].cellassign_dir, type: 'dir')]}
// add empty file for missing ref samples
.mix(cellassign_input_ch.missing_ref.map{[it[0]["library_id"], file(empty_file)]} )
// add in channel outputs
.mix(classify_cellassign.out)

// prepare input for process to add celltypes to the processed SCE
// result is [meta, processed rds, singler dir, cellassign dir]
assignment_input_ch = processed_sce_channel
assignment_input_ch = celltype_input_ch
.map{[it[0]["library_id"]] + it}
// add in singler results
.join(singler_output_ch, by: 0, failOnMismatch: true, failOnDuplicate: true)
// add in cell assign results
.join(cellassign_output_ch, by: 0, failOnMismatch: true, failOnDuplicate: true)
.map{it.drop(1)} // remove library_id
.branch{
// pull out libraries that actually have at least 1 type of annotations
// pull out libraries that actually have at least 1 type of annotation
add_celltypes: (it[2].baseName != "NO_FILE") || (it[3].baseName != "NO_FILE")
no_celltypes: true
}


// incorporate annotations into SCE object
add_celltypes_to_sce(assignment_input_ch.add_celltypes)

// mix in libraries without new celltypes
// result is [meta, proccessed rds]
celltyped_ch = assignment_input_ch.no_celltypes
.map{[it[0], it[1]]}
.mix(add_celltypes_to_sce.out)
.mix(add_celltypes_to_sce.out)

// add back in the unchanged sce files to the results
export_channel = celltyped_ch
Expand Down
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ params {
publish_fry_outs = false // alevin-fry outputs are not published by default. Use `--publish_fry_outs` to publish these files to the `checkpoints` folder.
spliced_only = false // include only spliced reads in counts matrix, by default both unspliced and spliced reads are totaled and found in `counts` asasy of returned SingleCellExperiment object
perform_celltyping = false // specify whether or not to incorporate cell type annotations
repeat_celltyping = false // if cell type annotations already exist, skip cell type classification with SingleR and CellAssign

seed = 2021 // random number seed for filtering and post-processing (0 means use system seed)

Expand Down
34 changes: 17 additions & 17 deletions templates/qc_report/celltypes_supplemental_report.rmd
Original file line number Diff line number Diff line change
Expand Up @@ -329,22 +329,22 @@ In this section, we assess the reliability of cell type annotations using diagno
knitr::asis_output("
### `SingleR` assessment
Here, we evaluate the quality of `SingleR` cell type annotations by comparing the scores for the assigned cell type to the median score of all other cell type labels.
To assess the quality of the `SingleR`-assigned cell types, we use the _delta median_ statistic.
- This quantity is the _delta median_ statistic.
- _Delta median_ is calculated for each cell by subtracting the median score of all other cell type labels from the score of the assigned cell type label.
- _Delta median_ is calculated for each cell as the difference between the `SingleR` score of the assigned cell type label and the median score of the other cell type labels in the reference dataset.
- Higher _delta median_ values indicate higher quality cell type annotations.
- However, there is no universal threshold for calling absolute high vs. low quality, as described in the [`SingleR` book section on 'Annotation diagnostics'](https://bioconductor.org/books/release/SingleRBook/annotation-diagnostics.html#annotation-diagnostics).
- Values can range from 0-1.
- Note that there is no universal threshold for calling absolute high vs. low quality, as described in the [`SingleR` book section on 'Annotation diagnostics'](https://bioconductor.org/books/release/SingleRBook/annotation-diagnostics.html#annotation-diagnostics).
You can interpret this plot as follows:
- Each point represents the _delta median_ statistic of a given cell whose final `SingleR` annotation is shown on the y-axis.
- The color of the points indicates how confident `SingleR` is in the cell type annotation:
- Each point represents the _delta median_ statistic of a given cell whose assigned `SingleR` annotation is shown on the y-axis.
- The point color indicates `SingleR`'s quality assessment of the annotation:
- High-quality cell annotations are shown as closed points.
- Low-quality cell annotations are shown as open points.
- Low-quality cell annotations are shown as open points.
In other sections of this report, these cells are referred to as `Unknown cell types`.
- For more information on how `SingleR` calculates annotation quality, please refer to [this `SingleR` documentation](https://rdrr.io/bioc/SingleR/man/pruneScores.html).
- Red diamonds represent the median _delta median_ statistic among high-quality points for the given annotation.
- Red diamonds represent the median _delta median_ statistic among high-quality annotations for the given cell type label.
")
```

Expand All @@ -364,7 +364,7 @@ delta_median_df <- tibble::tibble(
# so, negate for this variable:
confident = !is.na(metadata(processed_sce)$singler_result$pruned.labels)
) |>
dplyr::mutate(confident =
dplyr::mutate(confident =
ifelse(confident, "High-quality", "Low-quality")
)
Expand Down Expand Up @@ -417,7 +417,7 @@ ggplot(delta_median_df) +
aes(
x = delta_median,
y = annotation_wrapped,
shape = confident,
shape = confident,
alpha = confident
) +
ggforce::geom_sina(
Expand All @@ -426,7 +426,7 @@ ggplot(delta_median_df) +
fill = "white", # will apply to non-confident fill only
position = position_dodge(width = 0.05) # Keep both types of points mostly in line
) +
# Handle points aesthetics:
# Handle points aesthetics:
# confident are closed black with alpha = 0.5
# not confident are open black with alpha = 1
scale_shape_manual(values = c(19, 21)) +
Expand All @@ -438,16 +438,16 @@ ggplot(delta_median_df) +
) +
# add median diamond for confident points only
stat_summary(
data = delta_median_confident_df,
data = delta_median_confident_df,
color = "red",
geom = "point",
fun = "median",
shape = 18,
size = 2.25,
geom = "point",
fun = "median",
shape = 18,
size = 2.25,
alpha = 0.9
) +
guides(
alpha = FALSE,
alpha = FALSE,
shape = guide_legend(override.aes = list(size = 1.5, alpha = 0.55))
) +
theme(
Expand Down

0 comments on commit 0a77c45

Please sign in to comment.