* Create scGPT component files
* Load and preprocess data for scGPT
* Add model argument and download model files
* Embed data with scGPT and save output
* Add scGPT to benchmark workflow
* Style scGPT script

Co-authored-by: Robrecht Cannoodt <[email protected]>
Showing 4 changed files with 154 additions and 1 deletion.
config.vsh.yaml (new file):
@@ -0,0 +1,59 @@
__merge__: ../../api/comp_method.yaml

name: scgpt
label: scGPT
summary: "A foundation model for single-cell biology"
description: |
  scGPT is a foundation model for single-cell biology based on a generative
  pre-trained transformer and trained on a repository of over 33 million cells.
  Here, we use zero-shot output from a pre-trained model to get an integrated
  embedding for the batch integration task.
references:
  doi:
    - 10.1038/s41592-024-02201-0
links:
  documentation: https://scgpt.readthedocs.io/en/latest/
  repository: https://github.com/bowang-lab/scGPT

info:
  method_types: [embedding]
  preferred_normalization: counts
  variants:
    scgpt_default:
    scgpt_cp:
      model: "scGPT_CP"

arguments:
  - name: --model
    type: string
    description: String giving the scGPT model to use
    choices: ["scGPT_human", "scGPT_CP"]
    default: "scGPT_human"
  - name: --n_hvg
    type: integer
    default: 3000
    description: Number of highly variable genes to use.

resources:
  - type: python_script
    path: script.py
  - path: /src/utils/read_anndata_partial.py

engines:
  - type: docker
    image: openproblems/base_pytorch_nvidia:1.0.0
    # TODO: Try to find working installation of flash attention (flash-attn<1.0.5)
    setup:
      - type: python
        pypi:
          - gdown
          - scgpt # Install from PyPI to get dependencies
      - type: docker
        # Force re-installing from GitHub to get bug fixes
        run: pip install --upgrade --no-deps --force-reinstall git+https://github.com/bowang-lab/scGPT.git

runners:
  - type: executable
  - type: nextflow
    directives:
      label: [midtime, midmem, midcpu, gpu]
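The zero-shot approach described above comes down to a single call to scgpt.tasks.embed_data on the raw counts. The minimal sketch below (not part of the commit) illustrates that call; the input path and checkpoint directory are placeholders, and the full implementation is script.py, shown next.

import anndata as ad
import scgpt
import torch

# Placeholders: "input.h5ad" is assumed to hold raw counts in X and a
# "feature_name" column in .var; "checkpoint_dir" a downloaded pre-trained
# scGPT checkpoint folder.
adata = ad.read_h5ad("input.h5ad")
device = "cuda" if torch.cuda.is_available() else "cpu"

embedded = scgpt.tasks.embed_data(
    adata,
    "checkpoint_dir",
    gene_col="feature_name",
    batch_size=64,
    use_fast_transformer=False,  # flash attention is not installed in the image
    device=device,
    return_new_adata=True,
)
print(embedded.X.shape)  # one embedding vector per cell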
script.py (new file):
@@ -0,0 +1,92 @@
import sys
import tempfile

import anndata as ad
import gdown
import scgpt
import torch

## VIASH START
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
par = {
    "input": "resources_test/.../input.h5ad",
    "output": "output.h5ad",
    "model": "scGPT_human",
    "n_hvg": 3000,
}
meta = {"name": "scgpt"}
## VIASH END

print(f"====== scGPT version {scgpt.__version__} ======", flush=True)

sys.path.append(meta["resources_dir"])
from read_anndata_partial import read_anndata

print("\n>>> Reading input files...", flush=True)
print(f"Input H5AD file: '{par['input']}'", flush=True)
adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns")

if adata.uns["dataset_organism"] != "homo_sapiens":
    raise ValueError(
        f"scGPT can only be used with human data "
        f"(dataset_organism == \"{adata.uns['dataset_organism']}\")"
    )

print(adata, flush=True)

print("\n>>> Preprocessing data...", flush=True)
if par["n_hvg"]:
    print(f"Selecting top {par['n_hvg']} highly variable genes", flush=True)
    idx = adata.var["hvg_score"].to_numpy().argsort()[::-1][: par["n_hvg"]]
    adata = adata[:, idx].copy()

print(adata, flush=True)

print(f"\n>>> Downloading '{par['model']}' model...", flush=True)
model_drive_ids = {
    "scGPT_human": "1oWh_-ZRdhtoGQ2Fw24HP41FgLoomVo-y",
    "scGPT_CP": "1_GROJTzXiAV8HB4imruOTk6PEGuNOcgB",
}
drive_path = f"https://drive.google.com/drive/folders/{model_drive_ids[par['model']]}"
model_dir = tempfile.TemporaryDirectory()
print(f"Downloading from '{drive_path}'", flush=True)
gdown.download_folder(drive_path, output=model_dir.name, quiet=True)
print(f"Model directory: '{model_dir.name}'", flush=True)

print("\n>>> Embedding data...", flush=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: '{device}'", flush=True)
embedded = scgpt.tasks.embed_data(
    adata,
    model_dir.name,
    gene_col="feature_name",
    batch_size=64,
    use_fast_transformer=False,  # Disable fast-attn as not installed
    device=device,
    return_new_adata=True,
)

print("\n>>> Storing output...", flush=True)
output = ad.AnnData(
    obs=adata.obs[[]],
    var=adata.var[[]],
    obsm={
        "X_emb": embedded.X,
    },
    uns={
        "dataset_id": adata.uns["dataset_id"],
        "normalization_id": adata.uns["normalization_id"],
        "method_id": meta["name"],
    },
)
print(output)

print("\n>>> Writing output to file...", flush=True)
print(f"Output H5AD file: '{par['output']}'", flush=True)
output.write_h5ad(par["output"], compression="gzip")

print("\n>>> Cleaning up temporary directories...", flush=True)
model_dir.cleanup()

print("\n>>> Done!", flush=True)