AlexsLemonade · allyhawkins · Mar 18, 2024 · Mar 18, 2024 · Mar 18, 2024 · Apr 11, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -32,7 +32,7 @@ repos:
         args: [--update-only, --title=**Table of Contents**]
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff for linting and formatting python
-    rev: v0.3.2
+    rev: v0.3.3
     hooks:
       # Run the linter.
       - id: ruff
@@ -62,5 +62,6 @@ repos:
 ci:
   autofix_prs: true # set to false if we don't want fixes automatically applied by ci
   autoupdate_branch: development
+  autoupdate_schedule: quarterly
   # skip spell check because it can't be quickly installed on CI
   skip: [spell-check]
diff --git a/bin/add_celltypes_to_sce.R b/bin/add_celltypes_to_sce.R
@@ -187,7 +187,13 @@ if (!is.null(opt$cellassign_predictions)) {
     stop("Cell type reference metadata filename must be provided")
   }
 
-  predictions <- readr::read_tsv(opt$cellassign_predictions)
+  # if the predictions file isn't emtpy read it in
+  if (file.size(opt$cellassign_predictions) > 0) {
+    predictions <- readr::read_tsv(opt$cellassign_predictions)
+  } else {
+    # if it's empty, then sce could not be converted to anndata and cell assign was not run
+    sce$cellassign_celltype_annotation <- "Not run"
+  }
 
   # if the only column is the barcode column then CellAssign didn't complete successfully
   # otherwise add in cell type annotations and metadata to SCE

diff --git a/bin/add_demux_sce.R b/bin/add_demux_sce.R
@@ -107,21 +107,27 @@ cellhash_ids <- rownames(altExp(sce))
 
 # only perform demultiplexing if more than one HTO is detected
 if (length(cellhash_ids) > 1) {
+  demux_methods <- c()
   # add HashedDrops results
   if (opt$hash_demux) {
     sce <- scpcaTools::add_demux_hashedDrops(sce)
+    demux_methods <- c(demux_methods, "hashedDrops")
   }
 
   # add Seurat results
   if (opt$seurat_demux) {
     sce <- scpcaTools::add_demux_seurat(sce)
+    demux_methods <- c(demux_methods, "HTODemux")
   }
 
   # add vireo results
   if (!is.null(opt$vireo_dir)) {
     vireo_table <- readr::read_tsv(vireo_file)
     sce <- scpcaTools::add_demux_vireo(sce, vireo_table)
+    demux_methods <- c(demux_methods, "vireo")
   }
+
+  metadata(sce)$demux_methods <- demux_methods
 }
 
 # write filtered sce to output

diff --git a/bin/filter_sce.R b/bin/filter_sce.R
@@ -95,14 +95,19 @@ filtered_sce <- scpcaTools::filter_counts(unfiltered_sce)
 # remove unfiltered for memory saving
 rm(unfiltered_sce)
 
+# if no cells left, write out empty file and quit, otherwise continue processing
+if (ncol(filtered_sce) == 0) {
+  # make an empty filtered file
+  file.create(opt$filtered_file)
+  quit(save = "no")
+}
+
 # need to remove old gene-level rowData statistics first
 drop_cols <- colnames(rowData(filtered_sce)) %in% c("mean", "detected")
 rowData(filtered_sce) <- rowData(filtered_sce)[!drop_cols]
-
 # recalculate rowData and add to filtered sce
 filtered_sce <- filtered_sce |>
   scuttle::addPerFeatureQCMetrics()
-
 # add prob_compromised to colData and miQC model to metadata
 # since this can fail, we will check for success
 miQC_worked <- FALSE
@@ -115,28 +120,22 @@ try({
   metadata(filtered_sce)$prob_compromised_cutoff <- opt$prob_compromised_cutoff
   miQC_worked <- TRUE
 })
-
 # set prob_compromised to NA if miQC failed
 if (!miQC_worked) {
   warning("miQC failed. Setting `prob_compromised` to NA.")
   filtered_sce$prob_compromised <- NA_real_
   filtered_sce$miQC_pass <- NA
   metadata(filtered_sce)$prob_compromised_cutoff <- NA
 }
-
 # grab names of altExp, if any
 alt_names <- altExpNames(filtered_sce)
-
 for (alt in alt_names) {
   # remove old row data from unfiltered
   drop_cols <- colnames(rowData(altExp(filtered_sce, alt))) %in% c("mean", "detected")
   rowData(altExp(filtered_sce, alt)) <- rowData(altExp(filtered_sce, alt))[!drop_cols]
-
   # add alt experiment features stats for filtered data
   altExp(filtered_sce, alt) <- scuttle::addPerFeatureQCMetrics(altExp(filtered_sce, alt))
 }
-
-
 # calculate filtering QC from ADTs, if present
 if (!is.null(ambient_profile)) {
   # Create data frame of ADTs and their target types
@@ -146,7 +145,6 @@ if (!is.null(ambient_profile)) {
     # if only 2 columns exist, only the first two col_names will be used
     col_names = c("name", "barcode", "target_type")
   )
-
   # Assign default of `target` if no third column provided
   if (!"target_type" %in% names(adt_barcode_df)) {
     adt_barcode_df$target_type <- "target"
@@ -164,17 +162,14 @@ if (!is.null(ambient_profile)) {
       )
     )
   }
-
   # Check that barcode file ADTs match SCE ADTs
   if (!all.equal(sort(adt_barcode_df$name), sort(rownames(altExp(filtered_sce, adt_exp))))) {
     stop("Mismatch between provided ADT barcode file and ADTs in SCE.")
   }
-
   # Find any negative controls
   neg_controls <- adt_barcode_df |>
     dplyr::filter(target_type == "neg_control") |>
     dplyr::pull(name)
-
   # Calculate QC stats, providing negative controls if present
   # note: function fails if controls is length 0 or null, so keep the `if`
   if (length(neg_controls) == 0) {
@@ -189,17 +184,13 @@ if (!is.null(ambient_profile)) {
       controls = neg_controls
     )
   }
-
   # Save QC stats to the altexp
   colData(altExp(filtered_sce, adt_exp)) <- cbind(
     colData(altExp(filtered_sce, adt_exp)),
     adt_qc_df
   )
-
   # Add `target_type` to rowData
   rowData(altExp(filtered_sce, adt_exp))$target_type <- adt_barcode_df$target_type
 }
-
-
 # write filtered sce to output
 readr::write_rds(filtered_sce, opt$filtered_file, compress = "bz2")
diff --git a/bin/generate_unfiltered_sce.R b/bin/generate_unfiltered_sce.R
@@ -161,7 +161,7 @@ if (opt$feature_dir != "") {
 
 
 # read in sample metadata
-sample_metadata_df <- readr::read_tsv(opt$sample_metadata_file) |>
+sample_metadata_df <- readr::read_tsv(opt$sample_metadata_file, col_types = "c") |>
   # rename sample id column
   dplyr::rename("sample_id" = "scpca_sample_id") |>
   # add library ID as column in sample metadata
@@ -200,7 +200,7 @@ sample_type <- sample_metadata_df |>
       is_xenograft ~ "patient-derived xenograft",
       is_cell_line ~ "cell line",
       # if neither column was provided, note that
-      is.na(is_xenograft) && is.na(is_cell_line) ~ "Not provided",
+      is.na(is_xenograft) & is.na(is_cell_line) ~ "Not provided",
       .default = "patient tissue"
     )
   ) |>

diff --git a/bin/predict_cellassign.py b/bin/predict_cellassign.py
@@ -18,8 +18,8 @@
 parser = argparse.ArgumentParser()
 parser.add_argument(
     "-i",
-    "--input_hdf5_file",
-    dest="input_hdf5_file",
+    "--anndata_file",
+    dest="anndata_file",
     required=True,
     help="Path to HDF5 file with processed AnnData object to annotate",
 )
@@ -63,14 +63,14 @@
 scvi.settings.num_threads = args.threads
 
 # compile extension regex
-file_ext = re.compile(r"\.hdf5$|.h5$", re.IGNORECASE)
+file_ext = re.compile(r"\.h5ad$|\.hdf5$|.h5$", re.IGNORECASE)
 
 # check that input file exists, if it does exist, make sure it's an h5 file
-if not os.path.exists(args.input_hdf5_file):
-    raise FileExistsError("--input_hdf5_file file not found.")
-elif not file_ext.search(args.input_hdf5_file):
+if not os.path.exists(args.anndata_file):
+    raise FileExistsError("--anndata_file file not found.")
+elif not file_ext.search(args.anndata_file):
     raise ValueError(
-        "--input_hdf5_file must end in either .hdf5 or .h5 and contain a processed AnnData object."
+        "--anndata_file must end in .h5ad, .hdf5, or .h5 and contain a processed AnnData object."
     )
 
 # check that marker file exists and make sure its a tsv
@@ -85,7 +85,7 @@
 ref_matrix = pd.read_csv(args.reference, sep="\t", index_col="ensembl_id")
 
 # file path to annotated sce
-annotated_adata = adata.read_h5ad(args.input_hdf5_file)
+annotated_adata = adata.read_h5ad(args.anndata_file)
 
 # subset anndata to contain only genes in the reference file
 # note that the gene names must be the rownames of the reference matrix
@@ -95,7 +95,7 @@
 # check that shared_genes actually has some genes
 if not shared_genes:
     raise ValueError(
-        "--reference does not include any genes found in the provided --input_hdf5_file."
+        "--reference does not include any genes found in the provided --anndata_file."
     )
 
 # create a new anndata object with only shared genes

diff --git a/bin/sce_qc_report.R b/bin/sce_qc_report.R
@@ -102,12 +102,6 @@ option_list <- list(
     default = NA,
     help = "workflow commit hash"
   ),
-  make_option(
-    opt_str = "--demux_method",
-    type = "character",
-    default = "vireo",
-    help = "demultiplexing method to use for multiplexed samples. One of `vireo`, `HTOdemux`, or `HashedDrops`"
-  ),
   make_option(
     opt_str = "--seed",
     type = "integer",
@@ -129,13 +123,6 @@ if (!is.null(opt$report_template) && !file.exists(opt$report_template)) {
 if (is.null(opt$unfiltered_sce) || !file.exists(opt$unfiltered_sce)) {
   stop("Unfiltered .rds file missing or `unfiltered_sce` not specified.")
 }
-if (is.null(opt$filtered_sce) || !file.exists(opt$filtered_sce)) {
-  stop("Filtered .rds file missing or `filtered_sce` not specified.")
-}
-demux_methods <- c("vireo", "HTODemux", "HashedDrops")
-if (!opt$demux_method %in% demux_methods) {
-  stop("Unknown `demux_method` value. Must be one of `vireo`, `HTOdemux`, or `HashedDrops`")
-}
 
 if (opt$workflow_url == "null") {
   opt$workflow_url <- NA
@@ -147,22 +134,28 @@ if (opt$workflow_commit == "null") {
   opt$workflow_commit <- NA
 }
 
-# read sce files
+# read sce files and compile metadata for output files
 unfiltered_sce <- readr::read_rds(opt$unfiltered_sce)
-filtered_sce <- readr::read_rds(opt$filtered_sce)
+sce_meta <- metadata(unfiltered_sce)
+
+# make sure filtered sce has an object, otherwise set to NULL
+if (file.size(opt$filtered_sce) > 0) {
+  filtered_sce <- readr::read_rds(opt$filtered_sce)
+  filtered_sce_meta <- metadata(filtered_sce)
+} else {
+  filtered_sce <- NULL
+  filtered_sce_meta <- NULL
+}
 
 # make sure processed sce has an object, otherwise set to NULL
 if (file.size(opt$processed_sce) > 0) {
   processed_sce <- readr::read_rds(opt$processed_sce)
+  processed_sce_meta <- metadata(processed_sce)
 } else {
   processed_sce <- NULL
+  processed_sce_meta <- NULL
 }
 
-# Compile metadata for output files
-sce_meta <- metadata(unfiltered_sce)
-filtered_sce_meta <- metadata(filtered_sce)
-processed_sce_meta <- metadata(processed_sce)
-
 # Parse sample ids
 sample_ids <- unlist(stringr::str_split(opt$sample_id, ",|;")) |> sort()
 
@@ -233,17 +226,29 @@ if (has_citeseq) {
 
 # estimate cell counts for multiplexed samples
 if (multiplexed) {
-  demux_column <- paste0(opt$demux_method, "_sampleid")
+  # grab demux method to use for sample cell estimate from metadata
+  demux_methods <- filtered_sce_meta$demux_methods
+
+  # use vireo by default, otherwise use the first one in the list
+  if ("vireo" %in% demux_methods) {
+    demux_method <- "vireo"
+  } else {
+    demux_method <- demux_methods[1]
+  }
+
+  # get cell count estimates
+  demux_column <- paste0(demux_method, "_sampleid")
   demux_counts <- colData(filtered_sce)[[demux_column]] |>
+    factor(levels = sample_ids) |> # use a factor get any zero counts
     table() |>
     as.list() # save as a list for json output
 
   # add demux info to the metadata list
   metadata_list <- append(
     metadata_list,
     list(
-      demux_method = opt$demux_method,
-      demux_samples = sample_ids,
+      demux_method = demux_method,
+      demux_samples = names(demux_counts), # make sure order of sample ids matches counts order
       sample_cell_estimates = demux_counts
     )
   )

diff --git a/bin/sce_to_anndata.R b/bin/sce_to_anndata.R
@@ -21,7 +21,7 @@ option_list <- list(
   make_option(
     opt_str = c("--output_rna_h5"),
     type = "character",
-    help = "path to output hdf5 file to store RNA counts as AnnData object. Must end in .hdf5 or .h5"
+    help = "path to output hdf5 file to store RNA counts as AnnData object. Must end in .hdf5, .h5ad, or .h5"
   ),
   make_option(
     opt_str = c("--feature_name"),
@@ -131,6 +131,11 @@ format_czi <- function(sce) {
 # read in sce
 sce <- readr::read_rds(opt$input_sce_file)
 
+# if not enough cells to convert, quit and don't do anything
+if (ncol(sce) < 2) {
+  quit(save = "no")
+}
+
 # grab sample metadata
 # we need this if we have any feature data that we need to add it o
 sample_metadata <- metadata(sce)$sample_metadata
@@ -169,8 +174,8 @@ if (!is.null(opt$feature_name)) {
     # convert altExp
   } else {
     # check for output file
-    if (!(stringr::str_ends(opt$output_feature_h5, ".hdf5|.h5"))) {
-      stop("output feature file name must end in .hdf5 or .h5")
+    if (!(stringr::str_ends(opt$output_feature_h5, ".h5ad|.hdf5|.h5"))) {
+      stop("output feature file name must end in .h5ad, .hdf5, or .h5")
     }
 
     # extract altExp

diff --git a/examples/example_multiplex_pools.tsv b/examples/example_multiplex_pools.tsv
@@ -1,5 +1,3 @@
 scpca_library_id	scpca_sample_id	barcode_id
-SCPCL999994	SCPCS999993	TAG01
-SCPCL999994	SCPCS999994	TAG02
-SCPCL999995	SCPCS999995	TAG01
-SCPCL999995	SCPCS999996	TAG02
+library03	sample03	TAG01
+library03	sample04	TAG02