diff --git a/bin/add_celltypes_to_sce.R b/bin/add_celltypes_to_sce.R index 8530518a..fa5fc7b4 100755 --- a/bin/add_celltypes_to_sce.R +++ b/bin/add_celltypes_to_sce.R @@ -42,6 +42,12 @@ option_list <- list( type = "character", help = "Name of marker by cell type reference file used with CellAssign. File name is expected to be in form: `__.tsv`" + ), + make_option( + opt_str = c("--celltype_ref_metafile"), + type = "character", + help = "Metadata TSV file containing cell type reference metadata. + This file is used to obtain a list of organs used for CellAssign annotation and is only required if CellAssign results provided as input." ) ) @@ -177,6 +183,10 @@ if (!is.null(opt$cellassign_predictions)) { stop("CellAssign reference filename must be provided") } + if (is.null(opt$celltype_ref_metafile)) { + stop("Cell type reference metadata filename must be provided") + } + predictions <- readr::read_tsv(opt$cellassign_predictions) # get cell type with maximum prediction value for each cell @@ -216,6 +226,17 @@ if (!is.null(opt$cellassign_predictions)) { # note that if `metadata(sce)$celltype_methods` doesn't exist yet, this will # come out to just the string "cellassign" metadata(sce)$celltype_methods <- c(metadata(sce)$celltype_methods, "cellassign") + + # add cellassign reference organs to metadata + cellassign_organs <- opt$celltype_ref_metafile |> + readr::read_tsv() |> + dplyr::filter(celltype_ref_name == cellassign_ref_info[["ref_name"]]) |> + dplyr::pull(organs) + + if (cellassign_organs == "" | is.na(cellassign_organs)) { + stop("Failed to obtain CellAssign reference organ list.") + } + metadata(sce)$cellassign_reference_organs <- cellassign_organs } # export annotated object with cellassign assignments diff --git a/external-instructions.md b/external-instructions.md index cd77f1d6..cc90b69d 100644 --- a/external-instructions.md +++ b/external-instructions.md @@ -78,12 +78,12 @@ Using the above command will run the workflow from the `main` branch of the work To update to the latest released version you can run `nextflow pull AlexsLemonade/scpca-nf` before the `nextflow run` command. To be sure that you are using a consistent version, you can specify use of a release tagged version of the workflow, set below with the `-r` flag. -The command below will pull the `scpca-nf` workflow directly from Github using the `v0.7.0` version. +The command below will pull the `scpca-nf` workflow directly from Github using the `v0.7.1` version. Released versions can be found on the [`scpca-nf` repository releases page](https://github.com/AlexsLemonade/scpca-nf/releases). ```sh nextflow run AlexsLemonade/scpca-nf \ - -r v0.7.0 \ + -r v0.7.1 \ -config \ -profile ``` @@ -312,7 +312,7 @@ If you will be analyzing spatial expression data, you will also need the Cell Ra If your compute nodes do not have internet access, you will likely have to pre-pull the required container images as well. When doing this, it is important to be sure that you also specify the revision (version tag) of the `scpca-nf` workflow that you are using. -For example, if you would run `nextflow run AlexsLemonade/scpca-nf -r v0.7.0`, then you will want to set `-r v0.7.0` for `get_refs.py` as well to be sure you have the correct containers. +For example, if you would run `nextflow run AlexsLemonade/scpca-nf -r v0.7.1`, then you will want to set `-r v0.7.1` for `get_refs.py` as well to be sure you have the correct containers. By default, `get_refs.py` will download files and images associated with the latest release. If your system uses Docker, you can add the `--docker` flag: diff --git a/internal-instructions.md b/internal-instructions.md index 32b84a2d..25c4544b 100644 --- a/internal-instructions.md +++ b/internal-instructions.md @@ -82,7 +82,7 @@ Please refer to our [`CONTRIBUTING.md`](CONTRIBUTING.md#stub-workflows) for more When running the workflow for a project or group of samples that is ready to be released on ScPCA portal, please use the tag for the latest release: ``` -nextflow run AlexsLemonade/scpca-nf -r v0.7.0 -profile ccdl,batch --project SCPCP000000 +nextflow run AlexsLemonade/scpca-nf -r v0.7.1 -profile ccdl,batch --project SCPCP000000 ``` ### Processing example data diff --git a/lib/Utils.groovy b/lib/Utils.groovy index 01242ac4..15fcf4b1 100644 --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -1,6 +1,8 @@ import groovy.json.JsonSlurper import groovy.json.JsonOutput +import nextflow.script.WorkflowMetadata +import nextflow.NextflowMeta /** * Utility functions for scpca-nf @@ -24,10 +26,12 @@ class Utils { * Write a metadata map to a JSON string * * @param meta A metadata map + * @param updates A map of updates to the metadata * @return A JSON string */ - static def makeJson(meta) { - def meta_json = JsonOutput.toJson(meta) + static def makeJson(meta, updates = [:]) { + def meta_out = meta + updates // merge/overwrite the metadata with any updates + def meta_json = JsonOutput.toJson(meta_out) meta_json = JsonOutput.prettyPrint(meta_json) return(meta_json) } @@ -64,4 +68,19 @@ class Utils { '' } } + + + /** + * Make a map of versions for the workflow and nextflow, for adding to metadata + * + * @param workflow A nextflow WorkflowMetadata object containing the revision and manifest + * @param nextflow A NextflowMeta object (created by the nextflow runtime) + * @return A map with keys "scpca_version" and "nextflow_version" + */ + static def getVersions(WorkflowMetadata workflow, NextflowMeta nextflow){ + [ + scpca_version : workflow.revision ?: workflow.manifest.version, + nextflow_version : nextflow.version.toString() + ] + } } diff --git a/main.nf b/main.nf index d2ac5e6b..467c6d92 100644 --- a/main.nf +++ b/main.nf @@ -63,6 +63,18 @@ if (!resolution_strategies.contains(params.af_resolution)) { param_error = true } +// cell type annotation file checks +if (params.perform_celltyping) { + if (!file(params.project_celltype_metafile).exists()) { + log.error("The 'project_celltype_metafile' file '${params.project_celltype_metafile}' can not be found.") + param_error = true + } + if (!file(params.celltype_ref_metadata).exists()) { + log.error("The 'celltype_ref_metadata' file '${params.celltype_ref_metadata}' can not be found.") + param_error = true + } +} + if(param_error){ System.exit(1) } @@ -118,7 +130,7 @@ workflow { t2g_bulk_path: params.ref_rootdir + "/" + sample_refs["t2g_bulk_path"], cellranger_index: params.ref_rootdir + "/" + sample_refs["cellranger_index"], star_index: params.ref_rootdir + "/" + sample_refs["star_index"], - scpca_version: workflow.manifest.version, + scpca_version: workflow.revision ?: workflow.manifest.version, nextflow_version: nextflow.version.toString() ] } diff --git a/modules/af-features.nf b/modules/af-features.nf index e60550fe..6c5b5e94 100644 --- a/modules/af-features.nf +++ b/modules/af-features.nf @@ -53,6 +53,8 @@ process alevin_feature{ tech_version = meta.technology.split('_').last() umi_geom = umi_geom_map[tech_version] // get meta to write as file + // make sure workflow version strings are correct + meta += Utils.getVersions(workflow, nextflow) meta_json = Utils.makeJson(meta) """ mkdir -p ${run_dir} @@ -74,6 +76,7 @@ process alevin_feature{ """ stub: run_dir = "${meta.run_id}-features" + meta += Utils.getVersions(workflow, nextflow) meta_json = Utils.makeJson(meta) """ mkdir -p ${run_dir} @@ -95,6 +98,7 @@ process fry_quant_feature{ script: quant_dir = rad_dir + '_quant' // get meta to write as file + meta += Utils.getVersions(workflow, nextflow) meta_json = Utils.makeJson(meta) """ alevin-fry generate-permit-list \ @@ -124,6 +128,7 @@ process fry_quant_feature{ """ stub: quant_dir = rad_dir + '_quant' + meta += Utils.getVersions(workflow, nextflow) meta_json = Utils.makeJson(meta) """ mkdir -p '${quant_dir}' diff --git a/modules/af-rna.nf b/modules/af-rna.nf index b0f1f144..d8dcbb95 100644 --- a/modules/af-rna.nf +++ b/modules/af-rna.nf @@ -23,6 +23,7 @@ process alevin_rad{ '10Xv3.1': '--chromiumV3' ] // get meta to write as file + meta += Utils.getVersions(workflow, nextflow) meta_json = Utils.makeJson(meta) // run alevin like normal with the --rad flag // creates output directory with RAD file needed for alevin-fry @@ -43,6 +44,7 @@ process alevin_rad{ """ stub: rad_dir = file(meta.rad_dir).name + meta += Utils.getVersions(workflow, nextflow) meta_json = Utils.makeJson(meta) """ mkdir -p ${rad_dir} @@ -66,6 +68,7 @@ process fry_quant_rna{ script: quant_dir = rad_dir + '_quant' // get meta to write as file + meta += Utils.getVersions(workflow, nextflow) meta_json = Utils.makeJson(meta) """ alevin-fry generate-permit-list \ @@ -95,6 +98,7 @@ process fry_quant_rna{ """ stub: quant_dir = rad_dir + '_quant' + meta += Utils.getVersions(workflow, nextflow) meta_json = Utils.makeJson(meta) """ mkdir -p '${quant_dir}' diff --git a/modules/bulk-salmon.nf b/modules/bulk-salmon.nf index 37169a94..02196bc4 100644 --- a/modules/bulk-salmon.nf +++ b/modules/bulk-salmon.nf @@ -39,6 +39,7 @@ process salmon{ script: salmon_dir = file(meta.salmon_results_dir).name // get meta to write as file + meta += Utils.getVersions(workflow, nextflow) meta_json = Utils.makeJson(meta) """ salmon quant -i ${index} \ @@ -56,6 +57,7 @@ process salmon{ """ stub: salmon_dir = file(meta.salmon_results_dir).name + meta += Utils.getVersions(workflow, nextflow) meta_json = Utils.makeJson(meta) """ mkdir -p ${salmon_dir} diff --git a/modules/cellsnp.nf b/modules/cellsnp.nf index fdb22550..c7214e00 100644 --- a/modules/cellsnp.nf +++ b/modules/cellsnp.nf @@ -51,6 +51,7 @@ process vireo{ tuple val(meta), path(outdir) script: outdir = file(meta.vireo_dir).name + meta += Utils.getVersions(workflow, nextflow) meta_json = Utils.makeJson(meta) """ vireo \ @@ -63,6 +64,7 @@ process vireo{ """ stub: outdir = file(meta.vireo_dir).name + meta += Utils.getVersions(workflow, nextflow) meta_json = Utils.makeJson(meta) """ mkdir -p ${outdir} @@ -99,4 +101,3 @@ workflow cellsnp_vireo { emit: vireo.out } - diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf index 143d3c6b..235de35d 100644 --- a/modules/classify-celltypes.nf +++ b/modules/classify-celltypes.nf @@ -15,6 +15,8 @@ process classify_singler { tuple val(meta.library_id), path(singler_dir) script: singler_dir = file(meta.singler_dir).name + meta += Utils.getVersions(workflow, nextflow) + meta_json = Utils.makeJson(meta) """ # create output directory mkdir "${singler_dir}" @@ -27,13 +29,15 @@ process classify_singler { --threads ${task.cpus} # write out meta file - echo '${Utils.makeJson(meta)}' > "${singler_dir}/scpca-meta.json" + echo '${meta_json}' > "${singler_dir}/scpca-meta.json" """ stub: singler_dir = file(meta.singler_dir).name + meta += Utils.getVersions(workflow, nextflow) + meta_json = Utils.makeJson(meta) """ mkdir "${singler_dir}" - echo '${Utils.makeJson(meta)}' > "${singler_dir}/scpca-meta.json" + echo '${meta_json}' > "${singler_dir}/scpca-meta.json" """ } @@ -54,7 +58,8 @@ process classify_cellassign { tuple val(meta.library_id), path(cellassign_dir) script: cellassign_dir = file(meta.cellassign_dir).name - + meta += Utils.getVersions(workflow, nextflow) + meta_json = Utils.makeJson(meta) """ # create output directory mkdir "${cellassign_dir}" @@ -73,13 +78,15 @@ process classify_cellassign { --threads ${task.cpus} # write out meta file - echo '${Utils.makeJson(meta)}' > "${cellassign_dir}/scpca-meta.json" + echo '${meta_json}' > "${cellassign_dir}/scpca-meta.json" """ stub: cellassign_dir = file(meta.cellassign_dir).name + meta += Utils.getVersions(workflow, nextflow) + meta_json = Utils.makeJson(meta) """ mkdir "${cellassign_dir}" - echo '${Utils.makeJson(meta)}' > "${cellassign_dir}/scpca-meta.json" + echo '${meta_json}' > "${cellassign_dir}/scpca-meta.json" """ } @@ -90,6 +97,7 @@ process add_celltypes_to_sce { tag "${meta.library_id}" input: tuple val(meta), path(processed_rds), path(singler_dir), path(cellassign_dir) + path(celltype_ref_metadata) // TSV file of references metadata needed for CellAssign only output: tuple val(meta), path(annotated_rds) script: @@ -105,7 +113,8 @@ process add_celltypes_to_sce { ${singler_present ? "--singler_results ${singler_results}" : ''} \ ${singler_present ? "--singler_model_file ${meta.singler_model_file}" : ''} \ ${cellassign_present ? "--cellassign_predictions ${cellassign_predictions}" : ''} \ - ${cellassign_present ? "--cellassign_ref_file ${meta.cellassign_reference_file}" : ''} + ${cellassign_present ? "--cellassign_ref_file ${meta.cellassign_reference_file}" : ''} \ + ${cellassign_present ? "--celltype_ref_metafile ${celltype_ref_metadata}" : ''} """ stub: annotated_rds = "${meta.library_id}_processed_annotated.rds" @@ -183,7 +192,7 @@ workflow annotate_celltypes { .mix(classify_singler.out) // create cellassign input channel: [meta, processed sce, cellassign reference file] - cellassign_input_ch = celltype_input_ch + cellassign_input_ch = celltype_input_ch // add in cellassign reference .map{it.toList() + [file(it[0].cellassign_reference_file ?: empty_file)]} // skip if no cellassign reference file or reference name is not defined @@ -227,7 +236,10 @@ workflow annotate_celltypes { // incorporate annotations into SCE object - add_celltypes_to_sce(assignment_input_ch.add_celltypes) + add_celltypes_to_sce( + assignment_input_ch.add_celltypes, + file(params.celltype_ref_metadata) // file with CellAssign reference organs + ) // mix in libraries without new celltypes // result is [meta, proccessed rds] diff --git a/modules/spaceranger.nf b/modules/spaceranger.nf index 43f8c27e..860b68f9 100644 --- a/modules/spaceranger.nf +++ b/modules/spaceranger.nf @@ -14,6 +14,7 @@ process spaceranger{ tuple val(meta), path(out_id) script: out_id = file(meta.spaceranger_results_dir).name + meta += Utils.getVersions(workflow, nextflow) meta_json = Utils.makeJson(meta) """ spaceranger count \ @@ -35,6 +36,7 @@ process spaceranger{ """ stub: out_id = file(meta.spaceranger_results_dir).name + meta += Utils.getVersions(workflow, nextflow) meta_json = Utils.makeJson(meta) """ mkdir -p ${out_id}/outs diff --git a/nextflow.config b/nextflow.config index cc39abef..8689da07 100644 --- a/nextflow.config +++ b/nextflow.config @@ -5,7 +5,7 @@ manifest{ homePage = 'https://github.com/AlexsLemonade/scpca-nf' mainScript = 'main.nf' defaultBranch = 'main' - version = 'v0.7.0' + version = 'v0.7.1' } // global parameters for workflows diff --git a/templates/qc_report/celltypes_qc.rmd b/templates/qc_report/celltypes_qc.rmd index 55defb1b..fe6ee854 100644 --- a/templates/qc_report/celltypes_qc.rmd +++ b/templates/qc_report/celltypes_qc.rmd @@ -55,7 +55,7 @@ lump_celltypes <- function(df, n_celltypes = 7) { dplyr::mutate( across( ends_with("_celltype_annotation"), - \(x) forcats::fct_lump_n(x, n_celltypes, other_level = "All remaining cell types"), + \(x) forcats::fct_lump_n(x, n_celltypes, other_level = "All remaining cell types", ties.method = "first"), .names = "{.col}_lumped" ) ) @@ -109,6 +109,56 @@ plot_umap <- function( aspect.ratio = 1 ) } + + +#' Create a faceted UMAP panel where each panel has only one cell type colored +#' +#' @param umap_df Data frame with UMAP1 and UMAP2 columns +#' @param annotation_column Column containing cell type annotations +#' +#' @return ggplot object containing a faceted UMAP where each cell type is a facet. +#' In each panel, the cell type of interest is colored and all other cells are grey. +faceted_umap <- function(umap_df, + annotation_column) { + # color by the annotation column but only color one cell type at a time + faceted_umap <- ggplot( + umap_df, + aes(x = UMAP1, y = UMAP2, color = {{ annotation_column }}) + ) + + # set points for all "other" points + geom_point( + data = select( + umap_df, -{{ annotation_column }} + ), + color = "gray80", + alpha = 0.5, + size = 0.3 + ) + + # set points for desired cell type + geom_point(size = 0.3, alpha = 0.5) + + facet_wrap(vars({{ annotation_column }})) + + scale_color_brewer(palette = "Dark2") + + # remove axis numbers and background grid + scale_x_continuous(labels = NULL, breaks = NULL) + + scale_y_continuous(labels = NULL, breaks = NULL) + + guides( + color = guide_legend( + title = "Cell types", + # more visible points in legend + override.aes = list( + alpha = 1, + size = 1.5 + ) + ) + ) + + theme( + legend.position = c(.9, 0), + legend.justification = c("right", "bottom"), + legend.title.align = 0.5 + ) + + return(faceted_umap) +} ``` @@ -268,7 +318,8 @@ if (length(levels(umap_df$cluster)) <= 8) { ```{r, eval = has_umap} knitr::asis_output( 'Next, we show UMAPs colored by cell types. -A separate UMAP is shown for each cell typing method used. +For each cell typing method, we show a separate faceted UMAP. +In each panel, cells that were assigned the given cell type label are colored, while all other cells are in grey. For legibility, only the seven most common cell types are shown. All other cell types are grouped together and labeled "All remaining cell types" (not to be confused with "Unknown cell type" which represents cells that could not be classified). @@ -278,37 +329,32 @@ All other cell types are grouped together and labeled "All remaining cell types" -```{r eval = has_submitter && has_umap, message=FALSE, warning=FALSE} -plot_umap( +```{r, eval = has_submitter && has_umap, message=FALSE, warning=FALSE, fig.height = 9, fig.width=9} +# umap for cell assign annotations +faceted_umap( umap_df, - submitter_celltype_annotation_lumped, - "Cell types", - legend_nrow = 4 + submitter_celltype_annotation_lumped ) + - ggtitle("UMAP colored by submitter-provided annotations") + - scale_color_brewer(palette = "Dark2") + ggtitle("UMAP colored by submitter-provided annotations") ``` -```{r eval=has_singler && has_umap, message=FALSE, warning=FALSE} -plot_umap( +```{r, eval=has_singler && has_umap, message=FALSE, warning=FALSE, fig.height=9, fig.width=9} +# umap for cell assign annotations +faceted_umap( umap_df, - singler_celltype_annotation_lumped, - "Cell types", - legend_nrow = 4 + singler_celltype_annotation_lumped ) + - ggtitle("UMAP colored by SingleR annotations") + - scale_color_brewer(palette = "Dark2") + ggtitle("UMAP colored by SingleR annotations") ``` -```{r, eval = has_cellassign && has_umap, message=FALSE, warning=FALSE} -plot_umap( + +```{r, eval = has_cellassign && has_umap, message=FALSE, warning=FALSE, fig.height = 9, fig.width=9} +# umap for cell assign annotations +faceted_umap( umap_df, - cellassign_celltype_annotation_lumped, - "Cell types", - legend_nrow = 4 + cellassign_celltype_annotation_lumped ) + - ggtitle("UMAP colored by CellAssign annotations") + - scale_color_brewer(palette = "Dark2") + ggtitle("UMAP colored by CellAssign annotations") ``` diff --git a/templates/qc_report/celltypes_supplemental_report.rmd b/templates/qc_report/celltypes_supplemental_report.rmd index 5522cdc7..970cf1fc 100644 --- a/templates/qc_report/celltypes_supplemental_report.rmd +++ b/templates/qc_report/celltypes_supplemental_report.rmd @@ -339,18 +339,19 @@ if (has_singler) { } if (has_cellassign) { - tissue_type <- stringr::word( - # eg, "blood-compartment" - metadata(processed_sce)$cellassign_reference, - 1, - sep = "-" - ) + organs <- metadata(processed_sce)$cellassign_reference_organs |> + stringr::str_to_lower() + # split up and add an `and` before the final organ + organs_string <- stringr::str_split_1(organs, pattern = ", ") |> + stringr::str_flatten_comma(last = ", and ") + + ref_name <- metadata(processed_sce)$cellassign_reference methods_text <- glue::glue( "{methods_text} * Annotations from [`CellAssign`](https://github.com/Irrationone/cellassign), a marker-gene-based approach ([Zhang _et al._ 2019](https://doi.org/10.1038/s41592-019-0529-1)). - Marker genes associated with `{tissue_type}` tissue were obtained from [PanglaoDB](https://panglaodb.se/).\n - " + Marker genes for cell types were obtained from [PanglaoDB](https://panglaodb.se/) and compiled into a reference named `{ref_name}`. + This reference includes the following organs and tissue compartments: {organs_string}.\n" ) } @@ -526,8 +527,8 @@ singler_cellassign_matrix <- make_jaccard_matrix( create_single_heatmap( singler_cellassign_matrix, - row_title = "SingleR annotations", - column_title = "CellAssign annotations", + row_title = "CellAssign annotations", + column_title = "SingleR annotations", labels_font_size = find_label_size(all_celltypes), keep_legend_name = "SingleR annotations", col_fun = heatmap_col_fun, @@ -748,7 +749,7 @@ ggplot(celltype_df) + ) + labs( x = "Probability of annotated cell type", - y = "" + y = "Cell type annotation" ) + scale_x_continuous(limits = c(0, 1)) + scale_y_continuous(expand = c(0.1, 0.1)) +