
Commit

refactor components; feature methods are supposed to return the same hvg space it receives as input
rcannood committed Sep 24, 2024
1 parent 30ca2b6 commit 128b46f
Showing 11 changed files with 73 additions and 91 deletions.
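For context, a minimal sketch (toy data, not part of the commit) of the invariant the commit message describes: a method that outputs a corrected feature matrix should return exactly the gene (HVG) space it was given, rather than re-subsetting to its own top genes internally. The method name below is a placeholder.

```python
import anndata as ad
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
genes = [f"gene_{i}" for i in range(100)]
dataset = ad.AnnData(
    X=rng.normal(size=(20, 100)).astype(np.float32),
    var=pd.DataFrame(index=genes),
)

def dummy_feature_method(adata: ad.AnnData) -> ad.AnnData:
    # Placeholder "integration" that just copies the matrix; a real feature
    # method would batch-correct the values but keep the same variables.
    return ad.AnnData(X=adata.X.copy(), obs=adata.obs[[]], var=adata.var[[]])

corrected = dummy_feature_method(dataset)
# The invariant this refactor enforces: same genes, same order, no internal
# HVG selection inside the method.
assert corrected.var_names.equals(dataset.var_names)
```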
88 changes: 54 additions & 34 deletions src/data_processors/process_dataset/script.py
@@ -1,15 +1,15 @@
 import sys
 import anndata as ad
+import scanpy as sc
 import openproblems as op
 
 ## VIASH START
 par = {
-    'input': 'resources_test/common/pancreas/dataset.h5ad',
-    'hvgs': 2000,
-    'obs_label': 'cell_type',
-    'obs_batch': 'batch',
-    'subset_hvg': False,
-    'output': 'output.h5ad'
+    "input": "resources_test/common/pancreas/dataset.h5ad",
+    "hvgs": 2000,
+    "obs_label": "cell_type",
+    "obs_batch": "batch",
+    "output": "output.h5ad"
 }
 meta = {
     "config": "target/nextflow/batch_integration/process_dataset/.config.vsh.yaml",
@@ -18,60 +18,80 @@
 ## VIASH END
 
 # import helper functions
-sys.path.append(meta['resources_dir'])
+sys.path.append(meta["resources_dir"])
 from subset_h5ad_by_format import subset_h5ad_by_format
 
 print(">> Load config", flush=True)
 config = op.project.read_viash_config(meta["config"])
 
-print('Read input', flush=True)
-input = ad.read_h5ad(par['input'])
+print("Read input", flush=True)
+adata = ad.read_h5ad(par["input"])
 
 def compute_batched_hvg(adata, n_hvgs):
     if n_hvgs > adata.n_vars or n_hvgs <= 0:
         hvg_list = adata.var_names.tolist()
     else:
         import scib
         scib_adata = adata.copy()
-        scib_adata.X = scib_adata.layers['normalized'].copy()
+        del scib_adata.layers["counts"]
+        scib_adata.X = scib_adata.layers["normalized"].copy()
         hvg_list = scib.pp.hvg_batch(
             scib_adata,
-            batch_key='batch',
+            batch_key="batch",
             target_genes=n_hvgs,
             adataOut=False
         )
-    adata.var['hvg'] = adata.var_names.isin(hvg_list)
-    return adata
+    return hvg_list
 
-print(f'Select {par["hvgs"]} highly variable genes', flush=True)
-adata_with_hvg = compute_batched_hvg(input, n_hvgs=par['hvgs'])
+n_hvgs = par["hvgs"]
+n_components = adata.obsm["X_pca"].shape[1]
 
-if par['subset_hvg']:
-    print('Subsetting to HVG dimensions', flush=True)
-    adata_with_hvg = adata_with_hvg[:, adata_with_hvg.var['hvg']].copy()
+if adata.n_vars < n_hvgs:
+    n_hvgs = adata.n_vars
 
-print(">> Figuring out which data needs to be copied to which output file", flush=True)
-# use par arguments to look for label and batch value in different slots
-slot_mapping = {
-    "obs": {
-        "label": par["obs_label"],
-        "batch": par["obs_batch"],
-    }
-}
+if adata.n_vars > n_hvgs:
+    print(f"Select {par['hvgs']} highly variable genes", flush=True)
+    hvg_list = compute_batched_hvg(adata, n_hvgs=n_hvgs)
+
+    print("Subsetting to HVG dimensions", flush=True)
+    adata = adata[:, adata.var_names.isin(hvg_list)].copy()
+
+print(">> Recompute HVG", flush=True)
+out = sc.pp.highly_variable_genes(
+    adata,
+    layer="normalized",
+    n_top_genes=n_hvgs,
+    flavor="cell_ranger",
+    inplace=False
+)
+adata.var["hvg"] = out["highly_variable"].values
+adata.var["hvg_score"] = out["dispersions_norm"].values
+
+print(">> Recompute PCA", flush=True)
+X_pca, loadings, variance, variance_ratio = sc.pp.pca(
+    adata.layers["normalized"],
+    n_comps=n_components,
+    return_info=True
+)
+adata.obsm["X_pca"] = X_pca
+adata.varm["pca_loadings"] = loadings.T
+adata.uns["pca_variance"] = {
+    "variance": variance,
+    "variance_ratio": variance_ratio
+}
 
 print(">> Create output object", flush=True)
 output_dataset = subset_h5ad_by_format(
-    adata_with_hvg,
+    adata,
     config,
-    "output_dataset",
-    slot_mapping
+    "output_dataset"
 )
 output_solution = subset_h5ad_by_format(
-    adata_with_hvg,
+    adata,
     config,
-    "output_solution",
-    slot_mapping
+    "output_solution"
 )
 
-print('Writing adatas to file', flush=True)
-output_dataset.write_h5ad(par['output_dataset'], compression='gzip')
-output_solution.write_h5ad(par['output_solution'], compression='gzip')
+print("Writing adatas to file", flush=True)
+output_dataset.write_h5ad(par["output_dataset"], compression="gzip")
+output_solution.write_h5ad(par["output_solution"], compression="gzip")
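The recompute step above relies on scanpy's `inplace=False` mode, which returns a per-gene DataFrame instead of writing into `.var` directly. A self-contained toy sketch of that pattern (random counts, not the pancreas dataset; column names as in scanpy's documented output):

```python
import anndata as ad
import numpy as np
import scanpy as sc

rng = np.random.default_rng(0)
gene_means = rng.uniform(0.5, 5.0, size=500)
counts = rng.poisson(gene_means, size=(200, 500)).astype(np.float32)

toy = ad.AnnData(X=counts)
toy.layers["normalized"] = np.log1p(counts)  # stand-in for log CP10k

hvg_df = sc.pp.highly_variable_genes(
    toy,
    layer="normalized",
    n_top_genes=100,
    flavor="cell_ranger",
    inplace=False,  # return a DataFrame rather than modifying toy.var
)
toy.var["hvg"] = hvg_df["highly_variable"].values
toy.var["hvg_score"] = hvg_df["dispersions_norm"].values
print(int(toy.var["hvg"].sum()), "genes flagged as highly variable")
```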
1 change: 0 additions & 1 deletion src/methods/bbknn/script.py
@@ -20,7 +20,6 @@
 sys.path.append(meta["resources_dir"])
 from read_anndata_partial import read_anndata
 
-
 print('Read input', flush=True)
 adata = read_anndata(
     par['input'],
5 changes: 0 additions & 5 deletions src/methods/combat/config.vsh.yaml
@@ -22,11 +22,6 @@ info:
     combat_full_unscaled:
     combat_full_scaled:
       preferred_normalization: log_cp10k_scaled
-arguments:
-  - name: --n_hvg
-    type: integer
-    default: 2000
-    description: Number of highly variable genes to use.
 resources:
   - type: python_script
     path: script.py
10 changes: 0 additions & 10 deletions src/methods/combat/script.py
@@ -6,14 +6,11 @@
 par = {
     'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad',
     'output': 'output.h5ad',
-    'n_hvg': 2000,
 }
-
 meta = {
     'name': 'foo',
     'config': 'bar'
 }
-
 ## VIASH END
 
 sys.path.append(meta["resources_dir"])
@@ -28,16 +25,9 @@
     uns='uns'
 )
 
-if par['n_hvg']:
-    print(f"Select top {par['n_hvg']} high variable genes", flush=True)
-    idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']]
-    adata = adata[:, idx].copy()
-
-
 print('Run Combat', flush=True)
 adata.X = sc.pp.combat(adata, key='batch', inplace=False)
 
-
 print("Store output", flush=True)
 output = sc.AnnData(
     obs=adata.obs[[]],
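The block deleted here from the combat script (and, below, from the mnnpy, scalex, and scanorama scripts) was a per-method top-N gene selection based on the precomputed `hvg_score` column. For reference, a toy sketch of what that removed selection did; after this commit the dataset is already subset to a batch-aware HVG space in `process_dataset`, so the methods no longer re-select genes:

```python
import anndata as ad
import numpy as np

rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.normal(size=(10, 50)).astype(np.float32))
adata.var["hvg_score"] = rng.random(50)

n_hvg = 20
# Old per-method behaviour: keep the n_hvg genes with the highest hvg_score.
idx = adata.var["hvg_score"].to_numpy().argsort()[::-1][:n_hvg]
subset = adata[:, idx].copy()
assert subset.n_vars == n_hvg
```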
2 changes: 1 addition & 1 deletion src/methods/harmony/script.R
@@ -27,7 +27,7 @@ output <- anndata::AnnData(
   obs = adata$obs[, c()],
   var = adata$var[, c()],
   obsm = list(
-    X_emb = out,
+    X_emb = out
   ),
   uns = list(
     dataset_id = adata$uns[["dataset_id"]],
16 changes: 10 additions & 6 deletions src/methods/mnnpy/config.vsh.yaml
@@ -9,7 +9,16 @@ description: |
   This project is a python implementation of the MNN correct algorithm which takes advantage of python's extendability and hackability. It seamlessly integrates with the scanpy framework and has multicore support in its bones.
 references:
   doi: 10.1038/s41587-019-0113-3
+  bibtex: |
+    @misc{Kang2022,
+      author = {Kang, Chris},
+      title = {mnnpy},
+      year = {Kang2022},
+      publisher = {GitHub},
+      journal = {GitHub repository},
+      howpublished = {\url{https://github.com/chriscainx/mnnpy}},
+      commit = {2097dec30c193f036c5ed7e1c3d1e3a6270e102b}
+    }
 links:
   repository: https://github.com/chriscainx/mnnpy
   documentation: https://github.com/chriscainx/mnnpy#readme
@@ -20,11 +29,6 @@
     mnn_full_unscaled:
     mnn_full_scaled:
   preferred_normalization: log_cp10k_scaled
-arguments:
-  - name: --n_hvg
-    type: integer
-    default: 2000
-    description: Number of highly variable genes to use.
 resources:
   - type: python_script
     path: script.py
6 changes: 0 additions & 6 deletions src/methods/mnnpy/script.py
@@ -5,7 +5,6 @@
 par = {
     'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad',
     'output': 'output.h5ad',
-    'n_hvg': 2000,
 }
 meta = {
     'name': 'foo',
@@ -19,11 +18,6 @@
 del adata.layers['normalized']
 del adata.layers['counts']
 
-if par['n_hvg']:
-    print(f"Select top {par['n_hvg']} high variable genes", flush=True)
-    idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']]
-    adata = adata[:, idx].copy()
-
 print('Run mnn', flush=True)
 split = []
 batch_categories = adata.obs['batch'].cat.categories
5 changes: 0 additions & 5 deletions src/methods/scalex/config.vsh.yaml
@@ -17,11 +17,6 @@ info:
     scalex_feature_unscaled:
     scanorama_feature_scaled:
       preferred_normalization: log_cp10k_scaled
-arguments:
-  - name: --n_hvg
-    type: integer
-    default: 2000
-    description: Number of highly variable genes to use.
 resources:
   - type: python_script
     path: script.py
7 changes: 0 additions & 7 deletions src/methods/scalex/script.py
@@ -6,7 +6,6 @@
 par = {
     'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad',
     'output': 'output.h5ad',
-    'hvg': True,
 }
 meta = {
     'name' : 'foo',
@@ -27,12 +26,6 @@
     uns='uns'
 )
 
-
-if par['n_hvg']:
-    print(f"Select top {par['n_hvg']} high variable genes", flush=True)
-    idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']]
-    adata = adata[:, idx].copy()
-
 print('Run SCALEX', flush=True)
 adata = scalex.SCALEX(
     adata,
18 changes: 8 additions & 10 deletions src/methods/scanorama/config.vsh.yaml
@@ -3,24 +3,22 @@ name: scanorama
 label: Scanorama
 summary: Efficient integration of heterogeneous single-cell transcriptomes using Scanorama
 description: |
-  "Scanorama is an extension of the MNN method. Other then MNN, it finds mutual nearest neighbours over all batches and embeds observations into a joint hyperplane."
+  Scanorama enables batch-correction and integration of heterogeneous scRNA-seq datasets.
+  It is designed to be used in scRNA-seq pipelines downstream of noise-reduction methods,
+  including those for imputation and highly-variable gene filtering. The results from
+  Scanorama integration and batch correction can then be used as input to other tools
+  for scRNA-seq clustering, visualization, and analysis.
 references:
-  # Hie, B., Bryson, B. & Berger, B. Efficient integration of heterogeneous single-cell
-  # transcriptomes using Scanorama. Nat Biotechnol 37, 685–691 (2019).
-  # https://doi.org/10.1038/s41587-019-0113-3
   doi: 10.1038/s41587-019-0113-3
 links:
   repository: https://github.com/brianhie/scanorama
   documentation: https://github.com/brianhie/scanorama#readme
 info:
   method_types: [feature, embedding]
   preferred_normalization: log_cp10k
   variants:
     scanorama_full_unscaled:
     scanorama_full_scaled:
       preferred_normalization: log_cp10k_scaled
-arguments:
-  - name: --n_hvg
-    type: integer
-    default: 2000
-    description: Number of highly variable genes to use.
 resources:
   - type: python_script
     path: script.py
6 changes: 0 additions & 6 deletions src/methods/scanorama/script.py
@@ -6,7 +6,6 @@
 par = {
     'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad',
     'output': 'output.h5ad',
-    'n_hvg': 2000,
 }
 meta = {
     'name': 'foo',
@@ -53,11 +52,6 @@ def merge_adata(*adata_list, **kwargs):
     uns='uns'
 )
 
-if par['n_hvg']:
-    print(f"Select top {par['n_hvg']} high variable genes", flush=True)
-    idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']]
-    adata = adata[:, idx].copy()
-
 print('Run scanorama', flush=True)
 split = []
 batch_categories = adata.obs['batch'].cat.categories
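Both the mnnpy and scanorama scripts end (truncated above) by splitting the AnnData into one object per batch before integration. A minimal sketch of that split-and-remerge pattern with plain anndata, independent of either integration backend (the actual merge logic in the repo may differ):

```python
import anndata as ad
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
adata = ad.AnnData(
    X=rng.normal(size=(6, 4)).astype(np.float32),
    obs=pd.DataFrame({"batch": pd.Categorical(["a", "a", "b", "b", "c", "c"])}),
)

# One AnnData per batch, as the method scripts build before integration.
split = [
    adata[adata.obs["batch"] == cat].copy()
    for cat in adata.obs["batch"].cat.categories
]

# After per-batch correction, the pieces are concatenated again and put back
# into the original cell order.
merged = ad.concat(split)
merged = merged[adata.obs_names].copy()
assert list(merged.obs_names) == list(adata.obs_names)
```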
