
Commit

refactor components; feature methods are supposed to return the same hvg space it receives as input
rcannood committed Sep 24, 2024
1 parent 30ca2b6 commit 128b46f
Showing 11 changed files with 73 additions and 91 deletions.
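For context, a minimal sketch (toy data, not part of the commit) of the invariant the commit message describes: a method that outputs a corrected feature matrix should return exactly the gene (HVG) space it was given, rather than re-subsetting to its own top genes internally. The method name below is a placeholder.

```python
import anndata as ad
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
genes = [f"gene_{i}" for i in range(100)]
dataset = ad.AnnData(
    X=rng.normal(size=(20, 100)).astype(np.float32),
    var=pd.DataFrame(index=genes),
)

def dummy_feature_method(adata: ad.AnnData) -> ad.AnnData:
    # Placeholder "integration" that just copies the matrix; a real feature
    # method would batch-correct the values but keep the same variables.
    return ad.AnnData(X=adata.X.copy(), obs=adata.obs[[]], var=adata.var[[]])

corrected = dummy_feature_method(dataset)
# The invariant this refactor enforces: same genes, same order, no internal
# HVG selection inside the method.
assert corrected.var_names.equals(dataset.var_names)
```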
88 changes: 54 additions & 34 deletions src/data_processors/process_dataset/script.py
@@ -1,15 +1,15 @@
 import sys
 import anndata as ad
+import scanpy as sc
 import openproblems as op
 
 ## VIASH START
 par = {
-    'input': 'resources_test/common/pancreas/dataset.h5ad',
-    'hvgs': 2000,
-    'obs_label': 'cell_type',
-    'obs_batch': 'batch',
-    'subset_hvg': False,
-    'output': 'output.h5ad'
+    "input": "resources_test/common/pancreas/dataset.h5ad",
+    "hvgs": 2000,
+    "obs_label": "cell_type",
+    "obs_batch": "batch",
+    "output": "output.h5ad"
 }
 meta = {
     "config": "target/nextflow/batch_integration/process_dataset/.config.vsh.yaml",
@@ -18,60 +18,80 @@
 ## VIASH END
 
 # import helper functions
-sys.path.append(meta['resources_dir'])
+sys.path.append(meta["resources_dir"])
 from subset_h5ad_by_format import subset_h5ad_by_format
 
 print(">> Load config", flush=True)
 config = op.project.read_viash_config(meta["config"])
 
-print('Read input', flush=True)
-input = ad.read_h5ad(par['input'])
+print("Read input", flush=True)
+adata = ad.read_h5ad(par["input"])
 
 def compute_batched_hvg(adata, n_hvgs):
     if n_hvgs > adata.n_vars or n_hvgs <= 0:
         hvg_list = adata.var_names.tolist()
     else:
         import scib
         scib_adata = adata.copy()
-        scib_adata.X = scib_adata.layers['normalized'].copy()
+        del scib_adata.layers["counts"]
+        scib_adata.X = scib_adata.layers["normalized"].copy()
         hvg_list = scib.pp.hvg_batch(
             scib_adata,
-            batch_key='batch',
+            batch_key="batch",
             target_genes=n_hvgs,
             adataOut=False
         )
-    adata.var['hvg'] = adata.var_names.isin(hvg_list)
-    return adata
+    return hvg_list
 
-print(f'Select {par["hvgs"]} highly variable genes', flush=True)
-adata_with_hvg = compute_batched_hvg(input, n_hvgs=par['hvgs'])
+n_hvgs = par["hvgs"]
+n_components = adata.obsm["X_pca"].shape[1]
 
-if par['subset_hvg']:
-    print('Subsetting to HVG dimensions', flush=True)
-    adata_with_hvg = adata_with_hvg[:, adata_with_hvg.var['hvg']].copy()
+if adata.n_vars < n_hvgs:
+    n_hvgs = adata.n_vars
 
-print(">> Figuring out which data needs to be copied to which output file", flush=True)
-# use par arguments to look for label and batch value in different slots
-slot_mapping = {
-    "obs": {
-        "label": par["obs_label"],
-        "batch": par["obs_batch"],
-    }
-}
+if adata.n_vars > n_hvgs:
+    print(f"Select {par['hvgs']} highly variable genes", flush=True)
+    hvg_list = compute_batched_hvg(adata, n_hvgs=n_hvgs)
+
+    print("Subsetting to HVG dimensions", flush=True)
+    adata = adata[:, adata.var_names.isin(hvg_list)].copy()
+
+print(">> Recompute HVG", flush=True)
+out = sc.pp.highly_variable_genes(
+    adata,
+    layer="normalized",
+    n_top_genes=n_hvgs,
+    flavor="cell_ranger",
+    inplace=False
+)
+adata.var["hvg"] = out["highly_variable"].values
+adata.var["hvg_score"] = out["dispersions_norm"].values
+
+print(">> Recompute PCA", flush=True)
+X_pca, loadings, variance, variance_ratio = sc.pp.pca(
+    adata.layers["normalized"],
+    n_comps=n_components,
+    return_info=True
+)
+adata.obsm["X_pca"] = X_pca
+adata.varm["pca_loadings"] = loadings.T
+adata.uns["pca_variance"] = {
+    "variance": variance,
+    "variance_ratio": variance_ratio
+}
 
 print(">> Create output object", flush=True)
 output_dataset = subset_h5ad_by_format(
-    adata_with_hvg,
+    adata,
     config,
-    "output_dataset",
-    slot_mapping
+    "output_dataset"
 )
 output_solution = subset_h5ad_by_format(
-    adata_with_hvg,
+    adata,
     config,
-    "output_solution",
-    slot_mapping
+    "output_solution"
 )
 
-print('Writing adatas to file', flush=True)
-output_dataset.write_h5ad(par['output_dataset'], compression='gzip')
-output_solution.write_h5ad(par['output_solution'], compression='gzip')
+print("Writing adatas to file", flush=True)
+output_dataset.write_h5ad(par["output_dataset"], compression="gzip")
+output_solution.write_h5ad(par["output_solution"], compression="gzip")
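The recompute step above relies on scanpy's `inplace=False` mode, which returns a per-gene DataFrame instead of writing into `.var` directly. A self-contained toy sketch of that pattern (random counts, not the pancreas dataset; column names as in scanpy's documented output):

```python
import anndata as ad
import numpy as np
import scanpy as sc

rng = np.random.default_rng(0)
gene_means = rng.uniform(0.5, 5.0, size=500)
counts = rng.poisson(gene_means, size=(200, 500)).astype(np.float32)

toy = ad.AnnData(X=counts)
toy.layers["normalized"] = np.log1p(counts)  # stand-in for log CP10k

hvg_df = sc.pp.highly_variable_genes(
    toy,
    layer="normalized",
    n_top_genes=100,
    flavor="cell_ranger",
    inplace=False,  # return a DataFrame rather than modifying toy.var
)
toy.var["hvg"] = hvg_df["highly_variable"].values
toy.var["hvg_score"] = hvg_df["dispersions_norm"].values
print(int(toy.var["hvg"].sum()), "genes flagged as highly variable")
```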
1 change: 0 additions & 1 deletion src/methods/bbknn/script.py
@@ -20,7 +20,6 @@
 sys.path.append(meta["resources_dir"])
 from read_anndata_partial import read_anndata
 
-
 print('Read input', flush=True)
 adata = read_anndata(
     par['input'],
5 changes: 0 additions & 5 deletions src/methods/combat/config.vsh.yaml
@@ -22,11 +22,6 @@ info:
     combat_full_unscaled:
     combat_full_scaled:
       preferred_normalization: log_cp10k_scaled
-arguments:
-  - name: --n_hvg
-    type: integer
-    default: 2000
-    description: Number of highly variable genes to use.
 resources:
   - type: python_script
     path: script.py
10 changes: 0 additions & 10 deletions src/methods/combat/script.py
@@ -6,14 +6,11 @@
 par = {
     'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad',
     'output': 'output.h5ad',
-    'n_hvg': 2000,
 }
-
 meta = {
     'name': 'foo',
     'config': 'bar'
 }
-
 ## VIASH END
 
 sys.path.append(meta["resources_dir"])
@@ -28,16 +25,9 @@
     uns='uns'
 )
 
-if par['n_hvg']:
-    print(f"Select top {par['n_hvg']} high variable genes", flush=True)
-    idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']]
-    adata = adata[:, idx].copy()
-
-
 print('Run Combat', flush=True)
 adata.X = sc.pp.combat(adata, key='batch', inplace=False)
 
-
 print("Store output", flush=True)
 output = sc.AnnData(
     obs=adata.obs[[]],
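The block deleted here from the combat script (and, below, from the mnnpy, scalex, and scanorama scripts) was a per-method top-N gene selection based on the precomputed `hvg_score` column. For reference, a toy sketch of what that removed selection did; after this commit the dataset is already subset to a batch-aware HVG space in `process_dataset`, so the methods no longer re-select genes:

```python
import anndata as ad
import numpy as np

rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.normal(size=(10, 50)).astype(np.float32))
adata.var["hvg_score"] = rng.random(50)

n_hvg = 20
# Old per-method behaviour: keep the n_hvg genes with the highest hvg_score.
idx = adata.var["hvg_score"].to_numpy().argsort()[::-1][:n_hvg]
subset = adata[:, idx].copy()
assert subset.n_vars == n_hvg
```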
2 changes: 1 addition & 1 deletion src/methods/harmony/script.R
@@ -27,7 +27,7 @@ output <- anndata::AnnData(
   obs = adata$obs[, c()],
   var = adata$var[, c()],
   obsm = list(
-    X_emb = out,
+    X_emb = out
   ),
   uns = list(
     dataset_id = adata$uns[["dataset_id"]],
16 changes: 10 additions & 6 deletions src/methods/mnnpy/config.vsh.yaml
@@ -9,7 +9,16 @@ description: |
   This project is a python implementation of the MNN correct algorithm which takes advantage of python's extendability and hackability. It seamlessly integrates with the scanpy framework and has multicore support in its bones.
 references:
   doi: 10.1038/s41587-019-0113-3
+  bibtex: |
+    @misc{Kang2022,
+      author = {Kang, Chris},
+      title = {mnnpy},
+      year = {Kang2022},
+      publisher = {GitHub},
+      journal = {GitHub repository},
+      howpublished = {\url{https://github.com/chriscainx/mnnpy}},
+      commit = {2097dec30c193f036c5ed7e1c3d1e3a6270e102b}
+    }
 links:
   repository: https://github.com/chriscainx/mnnpy
   documentation: https://github.com/chriscainx/mnnpy#readme
@@ -20,11 +29,6 @@
     mnn_full_unscaled:
     mnn_full_scaled:
   preferred_normalization: log_cp10k_scaled
-arguments:
-  - name: --n_hvg
-    type: integer
-    default: 2000
-    description: Number of highly variable genes to use.
 resources:
   - type: python_script
     path: script.py
6 changes: 0 additions & 6 deletions src/methods/mnnpy/script.py
@@ -5,7 +5,6 @@
 par = {
     'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad',
     'output': 'output.h5ad',
-    'n_hvg': 2000,
 }
 meta = {
     'name': 'foo',
@@ -19,11 +18,6 @@
 del adata.layers['normalized']
 del adata.layers['counts']
 
-if par['n_hvg']:
-    print(f"Select top {par['n_hvg']} high variable genes", flush=True)
-    idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']]
-    adata = adata[:, idx].copy()
-
 print('Run mnn', flush=True)
 split = []
 batch_categories = adata.obs['batch'].cat.categories
5 changes: 0 additions & 5 deletions src/methods/scalex/config.vsh.yaml
@@ -17,11 +17,6 @@ info:
     scalex_feature_unscaled:
     scanorama_feature_scaled:
       preferred_normalization: log_cp10k_scaled
-arguments:
-  - name: --n_hvg
-    type: integer
-    default: 2000
-    description: Number of highly variable genes to use.
 resources:
   - type: python_script
     path: script.py
7 changes: 0 additions & 7 deletions src/methods/scalex/script.py
@@ -6,7 +6,6 @@
 par = {
     'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad',
     'output': 'output.h5ad',
-    'hvg': True,
 }
 meta = {
     'name' : 'foo',
@@ -27,12 +26,6 @@
     uns='uns'
 )
 
-
-if par['n_hvg']:
-    print(f"Select top {par['n_hvg']} high variable genes", flush=True)
-    idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']]
-    adata = adata[:, idx].copy()
-
 print('Run SCALEX', flush=True)
 adata = scalex.SCALEX(
     adata,
18 changes: 8 additions & 10 deletions src/methods/scanorama/config.vsh.yaml
@@ -3,24 +3,22 @@ name: scanorama
 label: Scanorama
 summary: Efficient integration of heterogeneous single-cell transcriptomes using Scanorama
 description: |
-  "Scanorama is an extension of the MNN method. Other then MNN, it finds mutual nearest neighbours over all batches and embeds observations into a joint hyperplane."
+  Scanorama enables batch-correction and integration of heterogeneous scRNA-seq datasets.
+  It is designed to be used in scRNA-seq pipelines downstream of noise-reduction methods,
+  including those for imputation and highly-variable gene filtering. The results from
+  Scanorama integration and batch correction can then be used as input to other tools
+  for scRNA-seq clustering, visualization, and analysis.
 references:
-  # Hie, B., Bryson, B. & Berger, B. Efficient integration of heterogeneous single-cell
-  # transcriptomes using Scanorama. Nat Biotechnol 37, 685–691 (2019).
-  # https://doi.org/10.1038/s41587-019-0113-3
   doi: 10.1038/s41587-019-0113-3
 links:
   repository: https://github.com/brianhie/scanorama
   documentation: https://github.com/brianhie/scanorama#readme
 info:
   method_types: [feature, embedding]
   preferred_normalization: log_cp10k
   variants:
     scanorama_full_unscaled:
     scanorama_full_scaled:
       preferred_normalization: log_cp10k_scaled
-arguments:
-  - name: --n_hvg
-    type: integer
-    default: 2000
-    description: Number of highly variable genes to use.
 resources:
   - type: python_script
     path: script.py
6 changes: 0 additions & 6 deletions src/methods/scanorama/script.py
@@ -6,7 +6,6 @@
 par = {
     'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad',
     'output': 'output.h5ad',
-    'n_hvg': 2000,
 }
 meta = {
     'name': 'foo',
@@ -53,11 +52,6 @@ def merge_adata(*adata_list, **kwargs):
     uns='uns'
 )
 
-if par['n_hvg']:
-    print(f"Select top {par['n_hvg']} high variable genes", flush=True)
-    idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']]
-    adata = adata[:, idx].copy()
-
 print('Run scanorama', flush=True)
 split = []
 batch_categories = adata.obs['batch'].cat.categories
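Both the mnnpy and scanorama scripts end (truncated above) by splitting the AnnData into one object per batch before integration. A minimal sketch of that split-and-remerge pattern with plain anndata, independent of either integration backend (the actual merge logic in the repo may differ):

```python
import anndata as ad
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
adata = ad.AnnData(
    X=rng.normal(size=(6, 4)).astype(np.float32),
    obs=pd.DataFrame({"batch": pd.Categorical(["a", "a", "b", "b", "c", "c"])}),
)

# One AnnData per batch, as the method scripts build before integration.
split = [
    adata[adata.obs["batch"] == cat].copy()
    for cat in adata.obs["batch"].cat.categories
]

# After per-batch correction, the pieces are concatenated again and put back
# into the original cell order.
merged = ad.concat(split)
merged = merged[adata.obs_names].copy()
assert list(merged.obs_names) == list(adata.obs_names)
```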
