Sync changes from development into main for v0.8.3 #779

Merged 18 commits into main from development on Aug 2, 2024
55 changes: 0 additions & 55 deletions bin/move_counts_anndata.py

This file was deleted.

4 changes: 2 additions & 2 deletions bin/predict_cellassign.py
@@ -63,7 +63,7 @@
scvi.settings.num_threads = args.threads

# compile extension regex
file_ext = re.compile(r"\.h5ad$|\.hdf5$|.h5$", re.IGNORECASE)
file_ext = re.compile(r"\.(hdf5|h5|h5ad)$", re.IGNORECASE)

# check that the input file exists and, if it does, that it's an H5 file
if not os.path.exists(args.anndata_file):
@@ -78,7 +78,7 @@
raise FileExistsError("--reference file not found.")

# make sure output file path is tsv file
- if not args.output_predictions.endswith(".tsv"):
+ if not args.output_predictions.casefold().endswith(".tsv"):
raise ValueError("--output_predictions must provide a file path ending in tsv")

# read in references as marker gene tables
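A note on the regex change above: the old pattern's last alternative, `.h5$`, left the dot unescaped, so any character before `h5` satisfied it. A minimal standalone sketch of the difference (the filenames are hypothetical):

```python
import re

old_ext = re.compile(r"\.h5ad$|\.hdf5$|.h5$", re.IGNORECASE)  # unescaped dot in .h5$
new_ext = re.compile(r"\.(hdf5|h5|h5ad)$", re.IGNORECASE)     # dot escaped, one group

print(bool(old_ext.search("counts_h5")))    # True: "_h5" satisfies ".h5$"
print(bool(new_ext.search("counts_h5")))    # False: a literal dot is required
print(bool(new_ext.search("counts.H5AD")))  # True: case-insensitive match
```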
118 changes: 118 additions & 0 deletions bin/reformat_anndata.py
@@ -0,0 +1,118 @@
#!/usr/bin/env python3
"""
This script takes an AnnData object and checks for `logcounts` in layers.
If present, `logcounts` is moved to `X`, and `X` (which holds the raw counts) is moved to `raw.X`.

In addition, any DataFrames in `obsm` are converted to ndarrays, and highly variable genes are converted to a `var` column.
If a `pca_meta_file` is provided, PCA variance statistics and standard creation values are added in the format expected by scanpy.
"""

import argparse
import os
import re

import anndata
import pandas as pd

parser = argparse.ArgumentParser()
parser.add_argument(
    "-i",
    "--anndata_file",
    dest="anndata_file",
    required=True,
    help="Path to HDF5 file with processed AnnData object",
)
parser.add_argument(
    "--pca_meta_file",
    dest="pca_meta_file",
    required=False,
    help="Path to file with a table of variance explained by each PCA component",
)
parser.add_argument(
    "--pca_not_centered",
    dest="pca_centered",
    action="store_false",
    help="Indicate that the PCA table is not zero-centered",
)
parser.add_argument(
    "--hvg_name",
    dest="hvg_name",
    default="highly_variable_genes",
    help=(
        "Indicate the name used to store highly variable genes associated with the PCA in the exported AnnData object."
        " Use the value 'none' if no highly variable genes were used."
    ),
)
parser.add_argument(
    "-u",
    "--uncompressed",
    dest="compress",
    action="store_false",
    help="Output an uncompressed HDF5 file",
)

args = parser.parse_args()

# compile extension regex
file_ext = re.compile(r"\.(hdf5|h5|h5ad)$", re.IGNORECASE)

# check that the input file exists and, if it does, that it's an H5 file
if not os.path.exists(args.anndata_file):
    raise FileNotFoundError("`input_anndata` does not exist.")
elif not file_ext.search(args.anndata_file):
    raise ValueError(
        "input_anndata must end in either .hdf5, .h5, or .h5ad, and contain a processed AnnData object."
    )

# if there is a pca_meta_file, check that it exists and has a tsv extension
if args.pca_meta_file:
    if not os.path.exists(args.pca_meta_file):
        raise FileNotFoundError("`pca_meta_file` does not exist.")
    elif not args.pca_meta_file.casefold().endswith(".tsv"):
        raise ValueError("`pca_meta_file` must end in .tsv.")

# read in anndata
adata = anndata.read_h5ad(args.anndata_file)

# if logcounts is present
if "logcounts" in adata.layers:
    # move X to raw.X by creating the raw adata
    adata.raw = adata
    # move logcounts to X and rename
    adata.X = adata.layers["logcounts"]
    adata.uns["X_name"] = "logcounts"
    del adata.layers["logcounts"]

# convert DataFrames in obsm to ndarrays
for key, value in adata.obsm.items():
    if isinstance(value, pd.DataFrame):
        adata.obsm[key] = value.to_numpy()

# convert highly variable genes to a column if given
use_hvg = args.hvg_name.casefold() != "none"
if use_hvg:
    if args.hvg_name not in adata.uns.keys():
        raise ValueError("`hvg_name` must be present in the `uns` data for the object")
    adata.var["highly_variable"] = adata.var.gene_ids.isin(adata.uns[args.hvg_name])


# add pca metadata to uns in the format created by scanpy, if pca_meta_file is provided
if args.pca_meta_file:
    pca_meta = pd.read_csv(args.pca_meta_file, sep="\t", index_col=0)
    if "variance" not in pca_meta.columns or "variance_ratio" not in pca_meta.columns:
        raise ValueError(
            "`pca_meta_file` must contain columns `variance` and `variance_ratio`"
        )
    pca_object = {
        "param": {
            "zero_center": args.pca_centered,
            "use_highly_variable": use_hvg,
            "mask_var": ("highly_variable" if use_hvg else None),
        },
        "variance": pca_meta["variance"].to_numpy(),
        "variance_ratio": pca_meta["variance_ratio"].to_numpy(),
    }
    adata.uns["pca"] = pca_object

# export adata
adata.write_h5ad(args.anndata_file, compression="gzip" if args.compress else None)
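For reviewers who want to exercise the reshuffling above, here is a minimal sketch (not part of the PR) that applies the same logcounts/raw swap and `obsm` conversion to a toy object:

```python
import anndata
import numpy as np
import pandas as pd

raw_counts = np.array([[1.0, 2.0], [3.0, 4.0]])

# toy object: X holds raw counts, logcounts lives in layers
adata = anndata.AnnData(X=raw_counts.copy(), layers={"logcounts": np.log1p(raw_counts)})
adata.obsm["X_pca"] = pd.DataFrame(np.zeros((2, 2)), index=adata.obs_names)

# same moves as reformat_anndata.py
adata.raw = adata  # snapshots the raw counts before X is replaced
adata.X = adata.layers["logcounts"]
adata.uns["X_name"] = "logcounts"
del adata.layers["logcounts"]
for key, value in adata.obsm.items():
    if isinstance(value, pd.DataFrame):
        adata.obsm[key] = value.to_numpy()

assert np.allclose(adata.raw.X, raw_counts)               # raw counts preserved
assert not isinstance(adata.obsm["X_pca"], pd.DataFrame)  # now an ndarray
```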
30 changes: 27 additions & 3 deletions bin/sce_to_anndata.R
@@ -23,6 +23,12 @@ option_list <- list(
type = "character",
help = "path to output hdf5 file to store RNA counts as AnnData object. Must end in .hdf5, .h5ad, or .h5"
),
make_option(
opt_str = c("--output_pca_tsv"),
default = NULL,
type = "character",
help = "path to output a table of variance explained by each principal component. Must end in .tsv"
),
make_option(
opt_str = c("--feature_name"),
type = "character",
@@ -62,12 +68,17 @@ if (!(stringr::str_ends(opt$output_rna_h5, ".hdf5|.h5|.h5ad"))) {
stop("output rna file name must end in .hdf5, .h5, or .h5ad")
}

# check that the pca file is a tsv
if (!is.null(opt$output_pca_tsv) && !stringr::str_ends(opt$output_pca_tsv, ".tsv")) {
stop("output pca file name must end in .tsv")
}

# Merged object function ------------------------------------------------------

# this function updates merged object formatting for anndata export
format_merged_sce <- function(sce) {
# paste X to any present reduced dim names
reducedDimNames(sce) <- glue::glue("X_{reducedDimNames(sce)}")
reducedDimNames(sce) <- glue::glue("X_{tolower(reducedDimNames(sce))}")
return(sce)
}

@@ -120,8 +131,8 @@ format_czi <- function(sce) {
# so everything gets set to false
rowData(sce)$feature_is_filtered <- FALSE

- # paste X to any present reduced dim names
- reducedDimNames(sce) <- glue::glue("X_{reducedDimNames(sce)}")
+ # paste X to any present reduced dim names, converting to lower case
+ reducedDimNames(sce) <- glue::glue("X_{tolower(reducedDimNames(sce))}")

return(sce)
}
@@ -161,6 +172,18 @@ scpcaTools::sce_to_anndata(
compression = ifelse(opt$compress_output, "gzip", "none")
)

# Get PCA metadata for AnnData
if (!is.null(opt$output_pca_tsv) && "X_pca" %in% reducedDimNames(sce)) {
pca_meta_df <- data.frame(
PC = 1:ncol(reducedDims(sce)$X_pca),
variance = attr(reducedDims(sce)$X_pca, "varExplained"),
variance_ratio = attr(reducedDims(sce)$X_pca, "percentVar") / 100
)

# write pca to tsv
readr::write_tsv(pca_meta_df, opt$output_pca_tsv)
}

message("Exported RNA")

# AltExp to AnnData -----------------------------------------------------------
@@ -209,5 +232,6 @@ if (!is.null(opt$feature_name)) {
")
)
}
message("Exported alt")
}
}
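The new `--output_pca_tsv` option creates a small file contract between this R exporter and `reformat_anndata.py`: a tab-separated table indexed by principal component with `variance` and `variance_ratio` columns. A reader-side sketch of that contract (the file name is hypothetical):

```python
import pandas as pd

# mirrors the column check in reformat_anndata.py; "example_pca.tsv" is a placeholder
pca_meta = pd.read_csv("example_pca.tsv", sep="\t", index_col=0)
missing = {"variance", "variance_ratio"} - set(pca_meta.columns)
if missing:
    raise ValueError(f"PCA table is missing columns: {sorted(missing)}")

# variance_ratio comes from percentVar / 100, so each value should fall in [0, 1]
assert pca_meta["variance_ratio"].between(0, 1).all()
```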
6 changes: 3 additions & 3 deletions external-instructions.md
@@ -86,12 +86,12 @@ Using the above command will run the workflow from the `main` branch of the work
To update to the latest released version you can run `nextflow pull AlexsLemonade/scpca-nf` before the `nextflow run` command.

To be sure that you are using a consistent version, you can specify a release-tagged version of the workflow with the `-r` flag, as shown below.
- The command below will pull the `scpca-nf` workflow directly from Github using the `v0.8.2` version.
+ The command below will pull the `scpca-nf` workflow directly from Github using the `v0.8.3` version.
Released versions can be found on the [`scpca-nf` repository releases page](https://github.com/AlexsLemonade/scpca-nf/releases).

```sh
nextflow run AlexsLemonade/scpca-nf \
-  -r v0.8.2 \
+  -r v0.8.3 \
-config <path to config file> \
-profile <name of profile>
```
@@ -325,7 +325,7 @@ If you will be analyzing spatial expression data, you will also need the Cell Ra

If your compute nodes do not have internet access, you will likely have to pre-pull the required container images as well.
When doing this, it is important to be sure that you also specify the revision (version tag) of the `scpca-nf` workflow that you are using.
- For example, if you would run `nextflow run AlexsLemonade/scpca-nf -r v0.8.2`, then you will want to set `-r v0.8.2` for `get_refs.py` as well to be sure you have the correct containers.
+ For example, if you would run `nextflow run AlexsLemonade/scpca-nf -r v0.8.3`, then you will want to set `-r v0.8.3` for `get_refs.py` as well to be sure you have the correct containers.
By default, `get_refs.py` will download files and images associated with the latest release.

If your system uses Docker, you can add the `--docker` flag:
2 changes: 1 addition & 1 deletion internal-instructions.md
@@ -87,7 +87,7 @@ Please refer to our [`CONTRIBUTING.md`](CONTRIBUTING.md#stub-workflows) for more
When running the workflow for a project or group of samples that is ready to be released on ScPCA portal, please use the tag for the latest release:

```
- nextflow run AlexsLemonade/scpca-nf -r v0.8.2 -profile ccdl,batch --project SCPCP000000
+ nextflow run AlexsLemonade/scpca-nf -r v0.8.3 -profile ccdl,batch --project SCPCP000000
```

### Processing example data
28 changes: 22 additions & 6 deletions merge.nf
@@ -109,8 +109,8 @@ process export_anndata {
${has_adt ? "--feature_name adt" : ''}

# move normalized counts to X in AnnData
- move_counts_anndata.py --anndata_file ${rna_h5ad_file}
- ${has_adt ? "move_counts_anndata.py --anndata_file ${feature_h5ad_file}" : ''}
+ reformat_anndata.py --anndata_file ${rna_h5ad_file}
+ ${has_adt ? "reformat_anndata.py --anndata_file ${feature_h5ad_file}" : ''}
"""
stub:
rna_h5ad_file = "${merge_group_id}_merged_rna.h5ad"
@@ -144,11 +144,11 @@ workflow {
|| (it.scpca_sample_id in run_ids)
}
.map{[
- seq_unit: it.seq_unit,
- technology: it.technology,
project_id: it.scpca_project_id,
library_id: it.scpca_library_id,
- sample_id: it.scpca_sample_id.split(";").sort().join(",")
+ sample_id: it.scpca_sample_id.split(";").sort().join(","),
+ seq_unit: it.seq_unit,
+ technology: it.technology
]}

// get all projects that contain at least one library with CITEseq
@@ -162,13 +162,23 @@ workflow {
.collect{it.project_id}
.map{it.unique()}

oversized_projects = libraries_ch
.map{[
it.project_id, // pull out project id for grouping
it
]}
.groupTuple(by: 0) // group by project id
.filter{it[1].size() > params.max_merge_libraries} // keep projects with more libraries than the merge maximum
.collect{it[0]} // get project id

filtered_libraries_ch = libraries_ch
// only include single-cell/single-nuclei which ensures we don't try to merge libraries from spatial or bulk data
.filter{it.seq_unit in ['cell', 'nucleus']}
- // remove any multiplexed projects
+ // remove any multiplexed projects or oversized projects
// future todo: only filter library ids that are multiplexed, but keep all other non-multiplexed libraries
.branch{
multiplexed: it.project_id in multiplex_projects.getVal()
oversized: it.project_id in oversized_projects.getVal()
single_sample: true
}

@@ -178,6 +188,12 @@
log.warn("Not merging ${it.project_id} because it contains multiplexed libraries.")
}

filtered_libraries_ch.oversized
.unique{ it.project_id }
.subscribe{
log.warn("Not merging ${it.project_id} because it contains too many libraries.")
}
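For readers less familiar with Nextflow channel operators, the `oversized_projects` channel above boils down to a group-and-threshold filter; a rough Python equivalent (illustrative only; the dict shape mirrors the channel maps):

```python
from collections import defaultdict

def oversized_projects(libraries, max_merge_libraries):
    """Return project ids with more libraries than the merge cutoff."""
    counts = defaultdict(int)
    for lib in libraries:
        counts[lib["project_id"]] += 1
    return {pid for pid, n in counts.items() if n > max_merge_libraries}

# such projects are warned about and excluded from merging, as in merge.nf
libs = [{"project_id": "SCPCP000001"}] * 3
print(oversized_projects(libs, max_merge_libraries=2))  # {'SCPCP000001'}
```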

// print out warning message for any libraries not included in merging
filtered_libraries_ch.single_sample
.map{[
8 changes: 5 additions & 3 deletions modules/export-anndata.nf
@@ -12,21 +12,23 @@ process export_anndata {
script:
rna_h5ad_file = "${meta.library_id}_${file_type}_rna.h5ad"
feature_h5ad_file = "${meta.library_id}_${file_type}_${meta.feature_type}.h5ad"
pca_meta_file = "${meta.library_id}_${file_type}_pca.tsv"
feature_present = meta.feature_type in ["adt"]
"""
sce_to_anndata.R \
--input_sce_file ${sce_file} \
--output_rna_h5 ${rna_h5ad_file} \
--output_feature_h5 ${feature_h5ad_file} \
--output_pca_tsv ${pca_meta_file} \
${feature_present ? "--feature_name ${meta.feature_type}" : ''} \
${file_type != "processed" ? "--compress_output" : ''}

- # move any normalized counts to X in AnnData
+ # move any normalized counts to X in AnnData, convert matrices, and add PCA metadata
if [ "${file_type}" = "processed" ]; then
- move_counts_anndata.py --anndata_file ${rna_h5ad_file}
+ reformat_anndata.py --anndata_file ${rna_h5ad_file} --pca_meta_file ${pca_meta_file}
# move counts in feature data, if the file exists
if [ -f "${feature_h5ad_file}" ]; then
- move_counts_anndata.py --anndata_file ${feature_h5ad_file}
+ reformat_anndata.py --anndata_file ${feature_h5ad_file} --hvg_name "none"
fi
fi
