diff --git a/README.md b/README.md
index 50c16e34..5f8c6e92 100644
--- a/README.md
+++ b/README.md
@@ -66,25 +66,25 @@ flowchart TB
file_solution("Solution")
comp_control_method[/"Control method"/]
comp_method[/"Method"/]
- comp_transformer[/"Transform"/]
+ comp_process_integration[/"Process integration"/]
comp_metric[/"Metric"/]
file_integrated("Integration")
- file_integrated_full("Transformed integration")
+ file_integrated_processed("Processed integration output")
file_score("Score")
file_common_dataset---comp_process_dataset
comp_process_dataset-->file_dataset
comp_process_dataset-->file_solution
file_dataset---comp_control_method
file_dataset---comp_method
- file_dataset---comp_transformer
+ file_dataset---comp_process_integration
file_solution---comp_control_method
file_solution---comp_metric
comp_control_method-->file_integrated
comp_method-->file_integrated
- comp_transformer-->file_integrated_full
+ comp_process_integration-->file_integrated_processed
comp_metric-->file_score
- file_integrated---comp_transformer
- file_integrated_full---comp_metric
+ file_integrated---comp_process_integration
+ file_integrated_processed---comp_metric
```
## File format: Common Dataset
@@ -276,9 +276,10 @@ Arguments:
-## Component type: Transform
+## Component type: Process integration
-Check the output and transform to create additional output types
+Process output from an integration method into the format expected by metrics
Arguments:
@@ -286,8 +287,8 @@ Arguments:
| Name | Type | Description |
|:---|:---|:---|
-| `--input_integrated` | `file` | An integrated AnnData dataset. |
| `--input_dataset` | `file` | Unintegrated AnnData HDF5 file. |
+| `--input_integrated` | `file` | An integrated AnnData dataset. |
| `--expected_method_types` | `string` | The expected output types of the batch integration method. |
@@ -356,12 +357,12 @@ Data structure:
-## File format: Transformed integration
+## File format: Processed integration output
An integrated AnnData dataset with additional outputs.
Example file:
-`resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad`
+`resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_processed.h5ad`
Description:
@@ -379,7 +380,7 @@ Format:
AnnData object
- obsm: 'X_emb'
+ obsm: 'X_emb', 'clustering'
obsp: 'connectivities', 'distances'
layers: 'corrected_counts'
uns: 'dataset_id', 'normalization_id', 'dataset_organism', 'method_id', 'neighbors'
@@ -393,6 +394,7 @@ Data structure:
| Slot | Type | Description |
|:---|:---|:---|
| `obsm["X_emb"]` | `double` | (*Optional*) Embedding output - 2D coordinate matrix. |
+| `obsm["clustering"]` | `integer` | Leiden clustering results at different resolutions. |
| `obsp["connectivities"]` | `double` | Graph output - neighbor connectivities matrix. |
| `obsp["distances"]` | `double` | Graph output - neighbor distances matrix. |
| `layers["corrected_counts"]` | `double` | (*Optional*) Feature output - corrected counts. |
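+
+For example, the precomputed cluster assignments can be read back as a
+cell-by-resolution data frame (a minimal sketch; the `leiden_0.8` column name
+follows the `leiden_<resolution>` convention used by the clustering component):
+
+```python
+import anndata as ad
+
+adata = ad.read_h5ad("integrated_processed.h5ad")
+clusters = adata.obsm["clustering"]  # pandas DataFrame, one column per resolution
+print(clusters["leiden_0.8"].value_counts())
+```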
diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh
index 49d2fd93..2d712578 100755
--- a/scripts/create_resources/test_resources.sh
+++ b/scripts/create_resources/test_resources.sh
@@ -14,26 +14,35 @@ DATASET_DIR=resources_test/task_batch_integration
mkdir -p $DATASET_DIR
# process dataset
-viash run src/data_processors/process_dataset/config.vsh.yaml -- \
+nextflow run . \
+ -main-script target/nextflow/workflows/process_datasets/main.nf \
+ -profile docker \
--input "$RAW_DATA/cxg_immune_cell_atlas/dataset.h5ad" \
- --output_dataset "$DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad" \
- --output_solution "$DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad"
+ --publish_dir "$DATASET_DIR/cxg_immune_cell_atlas" \
+ --output_dataset dataset.h5ad \
+ --output_solution solution.h5ad \
+ --output_state state.yaml \
+  -c common/nextflow_helpers/labels_ci.config
# run one method
viash run src/methods/combat/config.vsh.yaml -- \
--input $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \
--output $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad
-# run transformer
-viash run src/data_processors/transform/config.vsh.yaml -- \
- --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad \
- --input_dataset $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \
- --expected_method_types feature \
- --output $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad
+# process integration
+nextflow run . \
+ -main-script target/nextflow/data_processors/process_integration/main.nf \
+ -profile docker \
+ --input_dataset "$RAW_DATA/cxg_immune_cell_atlas/dataset.h5ad" \
+ --input_integrated "$DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad" \
+ --expected_method_types feature \
+ --publish_dir "$DATASET_DIR/cxg_immune_cell_atlas" \
+ --output integrated_processed.h5ad \
+  -c common/nextflow_helpers/labels_ci.config
# run one metric
viash run src/metrics/graph_connectivity/config.vsh.yaml -- \
- --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad \
+ --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated_processed.h5ad \
--input_solution $DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad \
--output $DATASET_DIR/cxg_immune_cell_atlas/score.h5ad
diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml
index bc57056a..1b3c5317 100644
--- a/src/api/comp_metric.yaml
+++ b/src/api/comp_metric.yaml
@@ -8,7 +8,7 @@ info:
A metric for evaluating batch integration methods.
arguments:
- name: --input_integrated
- __merge__: file_integrated_full.yaml
+ __merge__: file_integrated_processed.yaml
direction: input
required: true
- name: --input_solution
diff --git a/src/api/comp_process_integration.yaml b/src/api/comp_process_integration.yaml
new file mode 100644
index 00000000..944df29c
--- /dev/null
+++ b/src/api/comp_process_integration.yaml
@@ -0,0 +1,45 @@
+namespace: data_processors
+info:
+ type: process_integration
+ type_info:
+ label: Process integration
+    summary: Process output from an integration method into the format expected by metrics
+ description: |
+ This component will:
+
+ - Perform transformations of the integration output
+ - Cluster the integrated data at different resolutions
+
+argument_groups:
+ - name: Inputs
+ arguments:
+ - name: "--input_dataset"
+ __merge__: /src/api/file_dataset.yaml
+ type: file
+ direction: input
+ required: true
+ - name: "--input_integrated"
+ __merge__: /src/api/file_integrated.yaml
+ type: file
+ direction: input
+ required: true
+ - name: --expected_method_types
+ type: string
+ direction: input
+ required: true
+ multiple: true
+ description: |
+ The expected output types of the batch integration method.
+ choices: [ feature, embedding, graph ]
+ - name: Outputs
+ arguments:
+ - name: "--output"
+      __merge__: /src/api/file_integrated_processed.yaml
+ direction: output
+ required: true
+
+test_resources:
+ - type: python_script
+ path: /common/component_tests/run_and_check_output.py
+ - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas
+ dest: resources_test/task_batch_integration/cxg_immune_cell_atlas
diff --git a/src/api/comp_transformer.yaml b/src/api/comp_transformer.yaml
deleted file mode 100644
index b68a9c37..00000000
--- a/src/api/comp_transformer.yaml
+++ /dev/null
@@ -1,39 +0,0 @@
-namespace: data_processors
-info:
- type: transformer
- type_info:
- label: Transform
- summary: Check the output and transform to create additional output types
- description: |
- This component will:
-
- - Assert whether the input dataset and integrated dataset have the same shape.
- - Reorder the integrated dataset to match the input dataset if needed.
- - Transform the corrected feature output to an embedding.
- - Transform an embedding to a graph output.
-arguments:
- - name: --input_integrated
- __merge__: file_integrated.yaml
- direction: input
- required: true
- - name: --input_dataset
- __merge__: file_dataset.yaml
- direction: input
- required: true
- - name: --expected_method_types
- type: string
- direction: input
- required: true
- multiple: true
- description: |
- The expected output types of the batch integration method.
- choices: [ feature, embedding, graph ]
- - name: --output
- __merge__: file_integrated_full.yaml
- direction: output
- required: true
-test_resources:
- - type: python_script
- path: /common/component_tests/run_and_check_output.py
- - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas
- dest: resources_test/task_batch_integration/cxg_immune_cell_atlas
diff --git a/src/api/file_integrated_full.yaml b/src/api/file_integrated_processed.yaml
similarity index 88%
rename from src/api/file_integrated_full.yaml
rename to src/api/file_integrated_processed.yaml
index cdedb854..874182e8 100644
--- a/src/api/file_integrated_full.yaml
+++ b/src/api/file_integrated_processed.yaml
@@ -1,6 +1,6 @@
type: file
-example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad"
-label: Transformed integration
+example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_processed.h5ad"
+label: Processed integration output
summary: An integrated AnnData dataset with additional outputs.
description: |
Must contain at least one of:
@@ -23,6 +23,10 @@ info:
name: X_emb
description: Embedding output - 2D coordinate matrix
required: false
+ - type: integer
+ name: clustering
+ description: Leiden clustering results at different resolutions.
+ required: true
obsp:
- type: double
name: connectivities
diff --git a/src/data_processors/precompute_clustering_merge/config.vsh.yaml b/src/data_processors/precompute_clustering_merge/config.vsh.yaml
new file mode 100644
index 00000000..1ec001da
--- /dev/null
+++ b/src/data_processors/precompute_clustering_merge/config.vsh.yaml
@@ -0,0 +1,30 @@
+name: precompute_clustering_merge
+namespace: data_processors
+label: Merge clustering precomputations
+summary: Merge precomputed clustering results into the input dataset
+arguments:
+ - name: --input
+ type: file
+ direction: input
+ required: true
+ - name: --output
+ type: file
+ direction: output
+ required: true
+ - name: --clusterings
+ type: file
+ description: Clustering results to merge
+ direction: input
+ required: true
+ multiple: true
+resources:
+ - type: python_script
+ path: script.py
+engines:
+ - type: docker
+ image: openproblems/base_python:1.0.0
+runners:
+ - type: executable
+ - type: nextflow
+ directives:
+ label: [midtime, midmem, lowcpu]
diff --git a/src/data_processors/precompute_clustering_merge/script.py b/src/data_processors/precompute_clustering_merge/script.py
new file mode 100644
index 00000000..e008b964
--- /dev/null
+++ b/src/data_processors/precompute_clustering_merge/script.py
@@ -0,0 +1,28 @@
+import anndata as ad
+import pandas as pd
+
+## VIASH START
+par = {
+ "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
+ "clusterings": ["output.h5ad", "output2.h5ad"],
+ "output": "output3.h5ad",
+}
+## VIASH END
+
+print("Read clusterings", flush=True)
+clusterings = []
+for clus_file in par["clusterings"]:
+ adata = ad.read_h5ad(clus_file)
+ obs_filt = adata.obs.filter(regex='leiden_[0-9.]+')
+ clusterings.append(obs_filt)
+
+print("Merge clusterings", flush=True)
+merged = pd.concat(clusterings, axis=1)
+
+print("Read input", flush=True)
+input = ad.read_h5ad(par["input"])
+
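+# Attach the merged assignments as a cell-by-resolution data frame; pd.concat
+# aligns on the obs index, so the clustering files are assumed to share the
+# input's obs_names.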
+input.obsm["clustering"] = merged
+
+print("Store outputs", flush=True)
+input.write_h5ad(par["output"], compression="gzip")
diff --git a/src/data_processors/precompute_clustering_run/config.vsh.yaml b/src/data_processors/precompute_clustering_run/config.vsh.yaml
new file mode 100644
index 00000000..fe16036a
--- /dev/null
+++ b/src/data_processors/precompute_clustering_run/config.vsh.yaml
@@ -0,0 +1,35 @@
+name: precompute_clustering_run
+namespace: data_processors
+label: Run clustering precomputations
+summary: Run clustering on the input dataset
+arguments:
+ - name: --input
+ __merge__: /src/api/file_common_dataset.yaml
+ direction: input
+ required: true
+ - name: --output
+ __merge__: /src/api/file_dataset.yaml
+ direction: output
+ required: true
+  - name: --resolution
+    type: double
+    default: 0.8
+ description: Resolution parameter for clustering
+resources:
+ - type: python_script
+ path: script.py
+ - path: /src/utils/read_anndata_partial.py
+engines:
+ - type: docker
+ image: openproblems/base_python:1.0.0
+ setup:
+ - type: python
+ pypi:
+ - scanpy
+ - igraph
+ - leidenalg
+runners:
+ - type: executable
+ - type: nextflow
+ directives:
+ label: [midtime, midmem, lowcpu]
diff --git a/src/data_processors/precompute_clustering_run/script.py b/src/data_processors/precompute_clustering_run/script.py
new file mode 100644
index 00000000..4552ce01
--- /dev/null
+++ b/src/data_processors/precompute_clustering_run/script.py
@@ -0,0 +1,50 @@
+import sys
+import anndata as ad
+
+# check if we can use GPU
+USE_GPU = False
+try:
+ import subprocess
+ assert subprocess.run('nvidia-smi', shell=True, stdout=subprocess.DEVNULL).returncode == 0
+ from rapids_singlecell.tl import leiden
+ USE_GPU = True
+except Exception:
+ from scanpy.tl import leiden
+
+## VIASH START
+par = {
+ "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
+ "output": "output.h5ad",
+ "resolution": 0.8,
+}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from read_anndata_partial import read_anndata
+
+n_cell_cpu = 300_000
+
+print("Read input", flush=True)
+input = read_anndata(par["input"], obs='obs', obsp='obsp', uns='uns')
+
+key = f'leiden_{par["resolution"]}'
+kwargs = dict()
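+# On CPU, use scanpy's faster igraph backend; the rapids_singlecell GPU
+# implementation runs with its own defaults.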
+if not USE_GPU:
+ kwargs |= dict(
+ flavor='igraph',
+ n_iterations=2,
+ )
+
+print(f"Run Leiden clustering with {kwargs}", flush=True)
+leiden(
+ input,
+ resolution=par["resolution"],
+ key_added=key,
+ **kwargs,
+)
+
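+# Write only the new cluster assignments; precompute_clustering_merge joins
+# them back onto the integrated dataset.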
+print("Store outputs", flush=True)
+output = ad.AnnData(
+ obs=input.obs[[key]],
+)
+output.write_h5ad(par["output"], compression="gzip")
diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py
index d91937e7..71e680a3 100644
--- a/src/data_processors/process_dataset/script.py
+++ b/src/data_processors/process_dataset/script.py
@@ -12,8 +12,8 @@
"output": "output.h5ad"
}
meta = {
- "config": "target/nextflow/batch_integration/process_dataset/.config.vsh.yaml",
- "resources_dir": "src/common/helper_functions"
+ "config": "target/nextflow/data_processors/process_dataset/.config.vsh.yaml",
+ "resources_dir": "target/nextflow/data_processors/process_dataset"
}
## VIASH END
@@ -80,6 +80,12 @@ def compute_batched_hvg(adata, n_hvgs):
"variance_ratio": variance_ratio
}
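+# Replace the dataset's precomputed kNN graph with one built on the PCA embedding.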
+print(">> Recompute neighbors", flush=True)
+del adata.uns["knn"]
+del adata.obsp["knn_connectivities"]
+del adata.obsp["knn_distances"]
+sc.pp.neighbors(adata, use_rep="X_pca", n_neighbors=30, key_added="knn")
+
print(">> Create output object", flush=True)
output_dataset = subset_h5ad_by_format(
adata,
diff --git a/src/data_processors/process_integration/config.vsh.yaml b/src/data_processors/process_integration/config.vsh.yaml
new file mode 100644
index 00000000..8e9b2e1c
--- /dev/null
+++ b/src/data_processors/process_integration/config.vsh.yaml
@@ -0,0 +1,25 @@
+__merge__: /src/api/comp_process_integration.yaml
+
+name: process_integration
+
+argument_groups:
+ - name: Clustering
+ arguments:
+ - name: "--resolutions"
+ type: double
+ multiple: true
+ default: [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
+      description: Resolutions at which to cluster the integrated data
+
+resources:
+ - type: nextflow_script
+ path: main.nf
+ entrypoint: run_wf
+
+dependencies:
+ - name: data_processors/transform
+ - name: data_processors/precompute_clustering_run
+ - name: data_processors/precompute_clustering_merge
+
+runners:
+ - type: nextflow
diff --git a/src/data_processors/process_integration/main.nf b/src/data_processors/process_integration/main.nf
new file mode 100644
index 00000000..decbe484
--- /dev/null
+++ b/src/data_processors/process_integration/main.nf
@@ -0,0 +1,80 @@
+workflow run_wf {
+ take:
+ input_ch
+
+ main:
+ output_ch = input_ch
+
+ // transform output formats
+ | transform.run(
+ fromState: [
+ input_integrated: "input_integrated",
+ input_dataset: "input_dataset",
+ expected_method_types: "expected_method_types"
+ ],
+ toState: { id, output, state ->
+ def new_state = state + [
+ method_output_cleaned: output.output,
+ ]
+
+ new_state
+ }
+ )
+
+  // fan out: one clustering run per resolution
+ | flatMap { id, state ->
+ state.resolutions.collect { resolution ->
+ def newId = "${id}_r${resolution}"
+ def newState = state + [
+ "resolution": resolution,
+ "prevId": id
+ ]
+ [newId, newState]
+ }
+ }
+
+ // precompute clustering at one resolution
+ | precompute_clustering_run.run(
+ fromState: [
+ input: "method_output_cleaned",
+ resolution: "resolution"
+ ],
+ toState: ["output_clustering": "output"]
+ )
+
+ // group by original dataset id
+  | map{ id, state ->
+ [state.prevId, state]
+ }
+ | groupTuple()
+
+ // merge the clustering results into one state
+ | map{ id, states ->
+ if (states.size() == 0) {
+ throw new RuntimeException("Expected at least one state, but got ${states.size()}")
+ }
+ if (states.size() != states[0].resolutions.size()) {
+ throw new RuntimeException("Expected ${states[0].resolutions.size()} states, but got ${states.size()}")
+ }
+
+ def clusterings = states.collect { it.output_clustering }
+ def newState = states[0] + ["clusterings": clusterings]
+
+ [id, newState]
+ }
+
+ // merge clustering results into dataset h5ad
+ | precompute_clustering_merge.run(
+ fromState: [
+ input: "method_output_cleaned",
+ clusterings: "clusterings"
+ ],
+    toState: [output: "output"]
+ )
+
+ // only output what is defined in config
+ | setState(["output"])
+
+ emit:
+ output_ch
+}
diff --git a/src/data_processors/transform/config.vsh.yaml b/src/data_processors/transform/config.vsh.yaml
index 863207af..7dbe0a19 100644
--- a/src/data_processors/transform/config.vsh.yaml
+++ b/src/data_processors/transform/config.vsh.yaml
@@ -1,10 +1,10 @@
-__merge__: /src/api/comp_transformer.yaml
name: transform
+namespace: data_processors
label: Transform
summary: Check the output and transform to create additional output types
description: |
This component will:
- Assert whether the input dataset and integrated dataset have the same shape.
- Reorder the integrated dataset to match the input dataset if needed.
- Transform the corrected feature output to an embedding.
@@ -13,6 +13,27 @@ info:
test_setup:
default:
expected_method_types: feature
+arguments:
+ - name: --input_integrated
+ type: file
+ direction: input
+ required: true
+ - name: --input_dataset
+ type: file
+ direction: input
+ required: true
+ - name: --expected_method_types
+ type: string
+ direction: input
+ required: true
+ multiple: true
+ description: |
+ The expected output types of the batch integration method.
+ choices: [ feature, embedding, graph ]
+ - name: --output
+ type: file
+ direction: output
+ required: true
resources:
- type: python_script
path: script.py
@@ -25,5 +46,5 @@ engines:
runners:
- type: executable
- type: nextflow
    directives:
label: [midtime, midmem, lowcpu]
diff --git a/src/metrics/clustering_overlap/config.vsh.yaml b/src/metrics/clustering_overlap/config.vsh.yaml
index bc6d3fa8..11b0cbd8 100644
--- a/src/metrics/clustering_overlap/config.vsh.yaml
+++ b/src/metrics/clustering_overlap/config.vsh.yaml
@@ -49,6 +49,12 @@ info:
min: 0
max: 1
maximize: true
+arguments:
+ - name: --resolutions
+ type: double
+ multiple: true
+ default: [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
+    description: Clustering resolutions at which precomputed cluster assignments are expected in the input
resources:
- type: python_script
path: script.py
@@ -59,7 +65,7 @@ engines:
setup:
- type: python
pypi:
- - scib==1.1.5
+ - scib==1.1.6
runners:
- type: executable
- type: nextflow
diff --git a/src/metrics/clustering_overlap/script.py b/src/metrics/clustering_overlap/script.py
index 2254acb0..b2951896 100644
--- a/src/metrics/clustering_overlap/script.py
+++ b/src/metrics/clustering_overlap/script.py
@@ -1,4 +1,5 @@
import sys
+import pandas as pd
import anndata as ad
import scanpy as sc
from scib.metrics.clustering import cluster_optimal_resolution
@@ -8,6 +9,7 @@
par = {
-    'adata_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad',
+    'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_processed.h5ad',
'output': 'output.h5ad',
+ "resolutions": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
}
meta = {
@@ -20,23 +22,30 @@
print('Read input', flush=True)
-adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns')
+adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', obsp='obsp', uns='uns')
adata.obs = read_anndata(par['input_solution'], obs='obs').obs
adata.uns |= read_anndata(par['input_solution'], uns='uns').uns
print('Run optimal Leiden clustering', flush=True)
+cluster_key = "leiden"
+
+# get existing clusters
+cluster_df = adata.obsm.get('clustering', pd.DataFrame(index=adata.obs_names))
+adata.obs = pd.concat([adata.obs, cluster_df], axis=1)
+
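+# cluster_optimal_resolution() can then pick up the precomputed
+# leiden_<resolution> columns instead of re-clustering from scratch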
cluster_optimal_resolution(
adata=adata,
label_key="cell_type",
- cluster_key='cluster',
+ cluster_key=cluster_key,
cluster_function=sc.tl.leiden,
+ resolutions=par["resolutions"],
)
print('Compute ARI score', flush=True)
-ari_score = ari(adata, cluster_key='cluster', label_key="cell_type")
+ari_score = ari(adata, cluster_key=cluster_key, label_key="cell_type")
print('Compute NMI score', flush=True)
-nmi_score = nmi(adata, cluster_key='cluster', label_key="cell_type")
+nmi_score = nmi(adata, cluster_key=cluster_key, label_key="cell_type")
print("Create output AnnData object", flush=True)
output = ad.AnnData(
diff --git a/src/metrics/isolated_label_f1/config.vsh.yaml b/src/metrics/isolated_label_f1/config.vsh.yaml
index 27fc9f49..f4241f8f 100644
--- a/src/metrics/isolated_label_f1/config.vsh.yaml
+++ b/src/metrics/isolated_label_f1/config.vsh.yaml
@@ -37,6 +37,12 @@ info:
min: 0
max: 1
maximize: true
+arguments:
+ - name: --resolutions
+ type: double
+ multiple: true
+ default: [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
+    description: Clustering resolutions at which precomputed cluster assignments are expected in the input
resources:
- type: python_script
path: script.py
@@ -47,7 +53,7 @@ engines:
setup:
- type: python
pypi:
- - scib==1.1.5
+ - scib==1.1.6
runners:
- type: executable
- type: nextflow
diff --git a/src/metrics/isolated_label_f1/script.py b/src/metrics/isolated_label_f1/script.py
index 2737f244..08d1c431 100644
--- a/src/metrics/isolated_label_f1/script.py
+++ b/src/metrics/isolated_label_f1/script.py
@@ -1,4 +1,5 @@
import sys
+import pandas as pd
import anndata as ad
from scib.metrics import isolated_labels_f1
@@ -6,6 +7,7 @@
par = {
-    'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad',
+    'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_processed.h5ad',
'output': 'output.h5ad',
+ "resolutions": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
}
meta = {
@@ -18,15 +20,21 @@
print('Read input', flush=True)
-adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns')
+adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', obsp='obsp', uns='uns')
adata.obs = read_anndata(par['input_solution'], obs='obs').obs
adata.uns |= read_anndata(par['input_solution'], uns='uns').uns
+# get existing clusters
+cluster_df = adata.obsm.get('clustering', pd.DataFrame(index=adata.obs_names))
+adata.obs = pd.concat([adata.obs, cluster_df], axis=1)
+
print('compute score', flush=True)
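+# scib 1.1.6 accepts cluster_key and resolutions here, so the precomputed
+# leiden_<resolution> columns added above are reused rather than recomputed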
score = isolated_labels_f1(
adata,
label_key="cell_type",
- batch_key='batch',
+ batch_key="batch",
+ cluster_key="leiden",
+ resolutions=par["resolutions"],
embed=None,
iso_threshold=None,
verbose=True,
diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf
index fc2b142c..6e4a4020 100644
--- a/src/workflows/process_datasets/main.nf
+++ b/src/workflows/process_datasets/main.nf
@@ -38,6 +38,7 @@ workflow run_wf {
state.dataset != null
}
+ // process the dataset
| process_dataset.run(
fromState: [ input: "dataset" ],
toState: [
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index 75c93003..269f5337 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -42,6 +42,13 @@ argument_groups:
required: true
direction: output
default: task_info.yaml
+ - name: Clustering
+ arguments:
+ - name: "--resolutions"
+ type: double
+ multiple: true
+ default: [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
+      description: Resolutions at which to cluster the integrated data
- name: Method filtering
description: |
Use these arguments to filter methods by name. By default, all methods are
@@ -111,7 +118,8 @@ dependencies:
- name: metrics/kbet
- name: metrics/lisi
- name: metrics/pcr
- - name: data_processors/transform
+ # data processors
+ - name: data_processors/process_integration
runners:
- type: nextflow
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index afcb968c..035c42c5 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -80,9 +80,9 @@ workflow run_wf {
}
)
- /***************************
- * RUN METHODS AND METRICS *
- ***************************/
+ /***************
+ * RUN METHODS *
+ ***************/
score_ch = dataset_ch
@@ -135,13 +135,19 @@ workflow run_wf {
}
)
- | transform.run(
+ /******************
+ * PROCESS OUTPUT *
+ ******************/
+
+ | process_integration.run(
fromState: [
input_integrated: "method_output",
input_dataset: "input_dataset",
expected_method_types: "method_types"
],
toState: { id, output, state ->
+ // Add method types to the state
+ // This is done here because state can't be passed from the processing subworkflow
def method_types_cleaned = []
if ("feature" in state.method_types) {
method_types_cleaned += ["feature", "embedding", "graph"]
@@ -152,7 +158,7 @@ workflow run_wf {
}
def new_state = state + [
- method_output_cleaned: output.output,
+ method_output_processed: output.output,
method_types_cleaned: method_types_cleaned
]
@@ -160,6 +166,10 @@ workflow run_wf {
}
)
+ /***************
+ * RUN METRICS *
+ ***************/
+
// run all metrics
| runEach(
components: metrics,
@@ -172,7 +182,7 @@ workflow run_wf {
// use 'fromState' to fetch the arguments the component requires from the overall state
fromState: [
input_solution: "input_solution",
- input_integrated: "method_output_cleaned"
+ input_integrated: "method_output_processed"
],
// use 'toState' to publish that component's outputs to the overall state
toState: { id, output, state, comp ->