diff --git a/README.md b/README.md
index 50c16e34..5f8c6e92 100644
--- a/README.md
+++ b/README.md
@@ -66,25 +66,25 @@ flowchart TB
   file_solution("Solution")
   comp_control_method[/"Control method"/]
   comp_method[/"Method"/]
-  comp_transformer[/"Transform"/]
+  comp_process_integration[/"Process integration"/]
   comp_metric[/"Metric"/]
   file_integrated("Integration")
-  file_integrated_full("Transformed integration")
+  file_integrated_processed("Processed integration output")
   file_score("Score")
   file_common_dataset---comp_process_dataset
   comp_process_dataset-->file_dataset
   comp_process_dataset-->file_solution
   file_dataset---comp_control_method
   file_dataset---comp_method
-  file_dataset---comp_transformer
+  file_dataset---comp_process_integration
   file_solution---comp_control_method
   file_solution---comp_metric
   comp_control_method-->file_integrated
   comp_method-->file_integrated
-  comp_transformer-->file_integrated_full
+  comp_process_integration-->file_integrated_processed
   comp_metric-->file_score
-  file_integrated---comp_transformer
-  file_integrated_full---comp_metric
+  file_integrated---comp_process_integration
+  file_integrated_processed---comp_metric
```

## File format: Common Dataset

@@ -276,9 +276,10 @@

-## Component type: Transform
+## Component type: Process integration

-Check the output and transform to create additional output types
+Process output from an integration method to the format expected by
+metrics

Arguments:

@@ -286,8 +287,8 @@ Arguments:

| Name | Type | Description |
|:---|:---|:---|
-| `--input_integrated` | `file` | An integrated AnnData dataset. |
| `--input_dataset` | `file` | Unintegrated AnnData HDF5 file. |
+| `--input_integrated` | `file` | An integrated AnnData dataset. |
-| `--expected_method_types` | `string` | NA. |
-| `--expected_method_types` | `string` | NA. |
-| `--expected_method_types` | `string` | NA. |
+| `--expected_method_types` | `string` | The expected output types of the batch integration method. |

@@ -356,12 +357,12 @@

-## File format: Transformed integration
+## File format: Processed integration output

An integrated AnnData dataset with additional outputs.

Example file:
-`resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad`
+`resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_processed.h5ad`

Description:

@@ -379,7 +380,7 @@ Format:

     AnnData object
-     obsm: 'X_emb'
+     obsm: 'X_emb', 'clustering'
      obsp: 'connectivities', 'distances'
      layers: 'corrected_counts'
      uns: 'dataset_id', 'normalization_id', 'dataset_organism', 'method_id', 'neighbors'

@@ -393,6 +394,7 @@ Data structure:

| Slot | Type | Description |
|:---|:---|:---|
| `obsm["X_emb"]` | `double` | (*Optional*) Embedding output - 2D coordinate matrix. |
+| `obsm["clustering"]` | `integer` | Leiden clustering results at different resolutions. |
| `obsp["connectivities"]` | `double` | Graph output - neighbor connectivities matrix. |
| `obsp["distances"]` | `double` | Graph output - neighbor distances matrix. |
| `layers["corrected_counts"]` | `double` | (*Optional*) Feature output - corrected counts. |
diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh
index 49d2fd93..2d712578 100755
--- a/scripts/create_resources/test_resources.sh
+++ b/scripts/create_resources/test_resources.sh
@@ -14,26 +14,35 @@ DATASET_DIR=resources_test/task_batch_integration
mkdir -p $DATASET_DIR

# process dataset
-viash run src/data_processors/process_dataset/config.vsh.yaml -- \
+nextflow run . \
+  -main-script target/nextflow/workflows/process_datasets/main.nf \
+  -profile docker \
   --input "$RAW_DATA/cxg_immune_cell_atlas/dataset.h5ad" \
-  --output_dataset "$DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad" \
-  --output_solution "$DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad"
+  --publish_dir "$DATASET_DIR/cxg_immune_cell_atlas" \
+  --output_dataset dataset.h5ad \
+  --output_solution solution.h5ad \
+  --output_state state.yaml \
+  -c common/nextflow_helpers/labels_ci.config

# run one method
viash run src/methods/combat/config.vsh.yaml -- \
  --input $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \
  --output $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad

-# run transformer
-viash run src/data_processors/transform/config.vsh.yaml -- \
-  --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad \
-  --input_dataset $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \
-  --expected_method_types feature \
-  --output $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad
+# process integration
+nextflow run . \
+  -main-script target/nextflow/data_processors/process_integration/main.nf \
+  -profile docker \
+  --input_dataset "$DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad" \
+  --input_integrated "$DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad" \
+  --expected_method_types feature \
+  --publish_dir "$DATASET_DIR/cxg_immune_cell_atlas" \
+  --output integrated_processed.h5ad \
+  -c common/nextflow_helpers/labels_ci.config

# run one metric
viash run src/metrics/graph_connectivity/config.vsh.yaml -- \
-  --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad \
+  --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated_processed.h5ad \
   --input_solution $DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad \
   --output $DATASET_DIR/cxg_immune_cell_atlas/score.h5ad
diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml
index bc57056a..1b3c5317 100644
--- a/src/api/comp_metric.yaml
+++ b/src/api/comp_metric.yaml
@@ -8,7 +8,7 @@ info:
    A metric for evaluating batch integration methods.
arguments: - name: --input_integrated - __merge__: file_integrated_full.yaml + __merge__: file_integrated_processed.yaml direction: input required: true - name: --input_solution diff --git a/src/api/comp_process_integration.yaml b/src/api/comp_process_integration.yaml new file mode 100644 index 00000000..944df29c --- /dev/null +++ b/src/api/comp_process_integration.yaml @@ -0,0 +1,45 @@ +namespace: data_processors +info: + type: process_integration + type_info: + label: Process integration + summary: Process output from an integration method to the format expected by metrics + description: | + This component will: + + - Perform transformations of the integration output + - Cluster the integrated data at different resolutions + +argument_groups: + - name: Inputs + arguments: + - name: "--input_dataset" + __merge__: /src/api/file_dataset.yaml + type: file + direction: input + required: true + - name: "--input_integrated" + __merge__: /src/api/file_integrated.yaml + type: file + direction: input + required: true + - name: --expected_method_types + type: string + direction: input + required: true + multiple: true + description: | + The expected output types of the batch integration method. + choices: [ feature, embedding, graph ] + - name: Outputs + arguments: + - name: "--output" + __merge__: file_integrated_processed.yaml + direction: output + required: true + +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas + dest: resources_test/task_batch_integration/cxg_immune_cell_atlas diff --git a/src/api/comp_transformer.yaml b/src/api/comp_transformer.yaml deleted file mode 100644 index b68a9c37..00000000 --- a/src/api/comp_transformer.yaml +++ /dev/null @@ -1,39 +0,0 @@ -namespace: data_processors -info: - type: transformer - type_info: - label: Transform - summary: Check the output and transform to create additional output types - description: | - This component will: - - - Assert whether the input dataset and integrated dataset have the same shape. - - Reorder the integrated dataset to match the input dataset if needed. - - Transform the corrected feature output to an embedding. - - Transform an embedding to a graph output. -arguments: - - name: --input_integrated - __merge__: file_integrated.yaml - direction: input - required: true - - name: --input_dataset - __merge__: file_dataset.yaml - direction: input - required: true - - name: --expected_method_types - type: string - direction: input - required: true - multiple: true - description: | - The expected output types of the batch integration method. 
-    choices: [ feature, embedding, graph ]
-  - name: --output
-    __merge__: file_integrated_full.yaml
-    direction: output
-    required: true
-test_resources:
-  - type: python_script
-    path: /common/component_tests/run_and_check_output.py
-  - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas
-    dest: resources_test/task_batch_integration/cxg_immune_cell_atlas
diff --git a/src/api/file_integrated_full.yaml b/src/api/file_integrated_processed.yaml
similarity index 88%
rename from src/api/file_integrated_full.yaml
rename to src/api/file_integrated_processed.yaml
index cdedb854..874182e8 100644
--- a/src/api/file_integrated_full.yaml
+++ b/src/api/file_integrated_processed.yaml
@@ -1,6 +1,6 @@
type: file
-example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad"
-label: Transformed integration
+example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_processed.h5ad"
+label: Processed integration output
summary: An integrated AnnData dataset with additional outputs.
description: |
  Must contain at least one of:
@@ -23,6 +23,10 @@ info:
        name: X_emb
        description: Embedding output - 2D coordinate matrix
        required: false
+      - type: integer
+        name: clustering
+        description: Leiden clustering results at different resolutions
+        required: true
    obsp:
      - type: double
        name: connectivities
diff --git a/src/data_processors/precompute_clustering_merge/config.vsh.yaml b/src/data_processors/precompute_clustering_merge/config.vsh.yaml
new file mode 100644
index 00000000..1ec001da
--- /dev/null
+++ b/src/data_processors/precompute_clustering_merge/config.vsh.yaml
@@ -0,0 +1,30 @@
+name: precompute_clustering_merge
+namespace: data_processors
+label: Merge clustering precomputations
+summary: Merge precomputed clustering results into the input dataset
+arguments:
+  - name: --input
+    type: file
+    direction: input
+    required: true
+  - name: --output
+    type: file
+    direction: output
+    required: true
+  - name: --clusterings
+    type: file
+    description: Clustering results to merge
+    direction: input
+    required: true
+    multiple: true
+resources:
+  - type: python_script
+    path: script.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, midmem, lowcpu]
diff --git a/src/data_processors/precompute_clustering_merge/script.py b/src/data_processors/precompute_clustering_merge/script.py
new file mode 100644
index 00000000..e008b964
--- /dev/null
+++ b/src/data_processors/precompute_clustering_merge/script.py
@@ -0,0 +1,30 @@
+import anndata as ad
+import pandas as pd
+
+## VIASH START
+par = {
+    "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
+    "clusterings": ["output.h5ad", "output2.h5ad"],
+    "output": "output3.h5ad",
+}
+## VIASH END
+
+print("Read clusterings", flush=True)
+clusterings = []
+for clus_file in par["clusterings"]:
+    adata = ad.read_h5ad(clus_file)
+    # Keep only the "leiden_<resolution>" columns written by precompute_clustering_run
+    obs_filt = adata.obs.filter(regex='leiden_[0-9.]+')
+    clusterings.append(obs_filt)
+
+print("Merge clusterings", flush=True)
+merged = pd.concat(clusterings, axis=1)
+
+print("Read input", flush=True)
+input = ad.read_h5ad(par["input"])
+
+# Store one clustering column per resolution in a single obsm data frame
+input.obsm["clustering"] = merged
+
+print("Store outputs", flush=True)
+input.write_h5ad(par["output"], compression="gzip")
diff --git a/src/data_processors/precompute_clustering_run/config.vsh.yaml b/src/data_processors/precompute_clustering_run/config.vsh.yaml
new file mode 100644
index 00000000..fe16036a
--- /dev/null
+++ b/src/data_processors/precompute_clustering_run/config.vsh.yaml
@@ -0,0 +1,35 @@
+name: precompute_clustering_run
+namespace: data_processors
+label: Run clustering precomputations
+summary: Run clustering on the input dataset
+arguments:
+  - name: --input
+    __merge__: /src/api/file_common_dataset.yaml
+    direction: input
+    required: true
+  - name: --output
+    __merge__: /src/api/file_dataset.yaml
+    direction: output
+    required: true
+  - name: --resolution
+    type: double
+    default: 0.8
+    description: Resolution parameter for clustering
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/read_anndata_partial.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    setup:
+      - type: python
+        pypi:
+          - scanpy
+          - igraph
+          - leidenalg
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, midmem, lowcpu]
diff --git a/src/data_processors/precompute_clustering_run/script.py b/src/data_processors/precompute_clustering_run/script.py
new file mode 100644
index 00000000..4552ce01
--- /dev/null
+++ b/src/data_processors/precompute_clustering_run/script.py
@@ -0,0 +1,51 @@
+import sys
+import anndata as ad
+
+# Check if we can use a GPU: fall back to scanpy's CPU implementation when
+# nvidia-smi or rapids_singlecell is unavailable
+USE_GPU = False
+try:
+    import subprocess
+    assert subprocess.run('nvidia-smi', shell=True, stdout=subprocess.DEVNULL).returncode == 0
+    from rapids_singlecell.tl import leiden
+    USE_GPU = True
+except Exception:
+    from scanpy.tl import leiden
+
+## VIASH START
+par = {
+    "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
+    "output": "output.h5ad",
+    "resolution": 0.8,
+}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from read_anndata_partial import read_anndata
+
+print("Read input", flush=True)
+input = read_anndata(par["input"], obs='obs', obsp='obsp', uns='uns')
+
+key = f'leiden_{par["resolution"]}'
+kwargs = dict()
+if not USE_GPU:
+    # The igraph flavor with few iterations keeps CPU clustering tractable
+    kwargs |= dict(
+        flavor='igraph',
+        n_iterations=2,
+    )
+
+print(f"Run Leiden clustering with {kwargs}", flush=True)
+leiden(
+    input,
+    resolution=par["resolution"],
+    key_added=key,
+    **kwargs,
+)
+
+print("Store outputs", flush=True)
+# Only keep the new clustering column to keep the output small
+output = ad.AnnData(
+    obs=input.obs[[key]],
+)
+output.write_h5ad(par["output"], compression="gzip")
diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py
index d91937e7..71e680a3 100644
--- a/src/data_processors/process_dataset/script.py
+++ b/src/data_processors/process_dataset/script.py
@@ -12,8 +12,8 @@
    "output": "output.h5ad"
}
meta = {
-    "config": "target/nextflow/batch_integration/process_dataset/.config.vsh.yaml",
-    "resources_dir": "src/common/helper_functions"
+    "config": "target/nextflow/data_processors/process_dataset/.config.vsh.yaml",
+    "resources_dir": "target/nextflow/data_processors/process_dataset"
}
## VIASH END

@@ -80,6 +80,12 @@ def compute_batched_hvg(adata, n_hvgs):
        "variance_ratio": variance_ratio
    }

+print(">> Recompute neighbors", flush=True)
+del adata.uns["knn"]
+del adata.obsp["knn_connectivities"]
+del adata.obsp["knn_distances"]
+sc.pp.neighbors(adata, use_rep="X_pca", n_neighbors=30, key_added="knn")
+
print(">> Create output object", flush=True)
output_dataset = subset_h5ad_by_format(
    adata,
diff --git a/src/data_processors/process_integration/config.vsh.yaml b/src/data_processors/process_integration/config.vsh.yaml
new file mode 100644
index 00000000..8e9b2e1c
--- /dev/null
+++ b/src/data_processors/process_integration/config.vsh.yaml
@@ -0,0 +1,25 @@
+__merge__: /src/api/comp_process_integration.yaml
+
+name: process_integration
+
+argument_groups:
+  - name: Clustering
+    arguments:
+      - name: "--resolutions"
+        type: double
+        multiple: true
+        default: [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
+        description: Resolution parameters to use for clustering
+
+resources:
+  - type: nextflow_script
+    path: main.nf
+    entrypoint: run_wf
+
+dependencies:
+  - name: data_processors/transform
+  - name: data_processors/precompute_clustering_run
+  - name: data_processors/precompute_clustering_merge
+
+runners:
+  - type: nextflow
diff --git a/src/data_processors/process_integration/main.nf b/src/data_processors/process_integration/main.nf
new file mode 100644
index 00000000..decbe484
--- /dev/null
+++ b/src/data_processors/process_integration/main.nf
@@ -0,0 +1,80 @@
+workflow run_wf {
+  take:
+  input_ch
+
+  main:
+  output_ch = input_ch
+
+    // transform output formats
+    | transform.run(
+      fromState: [
+        input_integrated: "input_integrated",
+        input_dataset: "input_dataset",
+        expected_method_types: "expected_method_types"
+      ],
+      toState: { id, output, state ->
+        def new_state = state + [
+          method_output_cleaned: output.output,
+        ]
+
+        new_state
+      }
+    )
+
+    // fan out: emit one event per clustering resolution
+    | flatMap { id, state ->
+      state.resolutions.collect { resolution ->
+        def newId = "${id}_r${resolution}"
+        def newState = state + [
+          "resolution": resolution,
+          "prevId": id
+        ]
+        [newId, newState]
+      }
+    }
+
+    // precompute clustering at one resolution
+    | precompute_clustering_run.run(
+      fromState: [
+        input: "method_output_cleaned",
+        resolution: "resolution"
+      ],
+      toState: ["output_clustering": "output"]
+    )
+
+    // fan in: group by original dataset id
+    | map{id, state ->
+      [state.prevId, state]
+    }
+    | groupTuple()
+
+    // merge the clustering results into one state
+    | map{ id, states ->
+      if (states.size() == 0) {
+        throw new RuntimeException("Expected at least one state, but got ${states.size()}")
+      }
+      if (states.size() != states[0].resolutions.size()) {
+        throw new RuntimeException("Expected ${states[0].resolutions.size()} states, but got ${states.size()}")
+      }
+
+      def clusterings = states.collect { it.output_clustering }
+      def newState = states[0] + ["clusterings": clusterings]
+
+      [id, newState]
+    }
+
+    // merge clustering results into dataset h5ad
+    | precompute_clustering_merge.run(
+      fromState: [
+        input: "method_output_cleaned",
+        clusterings: "clusterings"
+      ],
+      toState: [output: "output"]
+    )
+
+    // only output what is defined in config
+    | setState(["output"])
+
+  emit:
+  output_ch
+}
diff --git a/src/data_processors/transform/config.vsh.yaml b/src/data_processors/transform/config.vsh.yaml
index 863207af..7dbe0a19 100644
--- a/src/data_processors/transform/config.vsh.yaml
+++ b/src/data_processors/transform/config.vsh.yaml
@@ -1,10 +1,10 @@
-__merge__: /src/api/comp_transformer.yaml
name: transform
+namespace: data_processors
label: Transform
summary: Check the output and transform to create additional output types
description: |
  This component will:
-  
+
  - Assert whether the input dataset and integrated dataset have the same shape.
  - Reorder the integrated dataset to match the input dataset if needed.
  - Transform the corrected feature output to an embedding.
@@ -13,6 +13,27 @@ info:
  test_setup:
    default:
      expected_method_types: feature
+arguments:
+  - name: --input_integrated
+    type: file
+    direction: input
+    required: true
+  - name: --input_dataset
+    type: file
+    direction: input
+    required: true
+  - name: --expected_method_types
+    type: string
+    direction: input
+    required: true
+    multiple: true
+    description: |
+      The expected output types of the batch integration method.
+    choices: [ feature, embedding, graph ]
+  - name: --output
+    type: file
+    direction: output
+    required: true
resources:
  - type: python_script
    path: script.py
@@ -25,5 +46,5 @@ engines:
runners:
  - type: executable
  - type: nextflow
-    directives: 
+    directives:
      label: [midtime, midmem, lowcpu]
diff --git a/src/metrics/clustering_overlap/config.vsh.yaml b/src/metrics/clustering_overlap/config.vsh.yaml
index bc6d3fa8..11b0cbd8 100644
--- a/src/metrics/clustering_overlap/config.vsh.yaml
+++ b/src/metrics/clustering_overlap/config.vsh.yaml
@@ -49,6 +49,12 @@ info:
        min: 0
        max: 1
        maximize: true
+arguments:
+  - name: --resolutions
+    type: double
+    multiple: true
+    default: [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
+    description: Resolutions at which to look up precomputed clusters
resources:
  - type: python_script
    path: script.py
@@ -59,7 +65,7 @@ engines:
    setup:
      - type: python
        pypi:
-          - scib==1.1.5
+          - scib==1.1.6
runners:
  - type: executable
  - type: nextflow
diff --git a/src/metrics/clustering_overlap/script.py b/src/metrics/clustering_overlap/script.py
index 2254acb0..b2951896 100644
--- a/src/metrics/clustering_overlap/script.py
+++ b/src/metrics/clustering_overlap/script.py
@@ -1,4 +1,5 @@
import sys
+import pandas as pd
import anndata as ad
import scanpy as sc
from scib.metrics.clustering import cluster_optimal_resolution
@@ -8,6 +9,7 @@
par = {
-    'adata_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad',
+    'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_processed.h5ad',
    'output': 'output.h5ad',
+    "resolutions": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
}

meta = {
@@ -20,23 +22,30 @@

print('Read input', flush=True)
-adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns')
+adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', obsp='obsp', uns='uns')
adata.obs = read_anndata(par['input_solution'], obs='obs').obs
adata.uns |= read_anndata(par['input_solution'], uns='uns').uns

print('Run optimal Leiden clustering', flush=True)
+cluster_key = "leiden"
+
+# get existing clusters precomputed by process_integration, if any
+cluster_df = adata.obsm.get('clustering', pd.DataFrame(index=adata.obs_names))
+adata.obs = pd.concat([adata.obs, cluster_df], axis=1)
+
cluster_optimal_resolution(
    adata=adata,
    label_key="cell_type",
-    cluster_key='cluster',
+    cluster_key=cluster_key,
    cluster_function=sc.tl.leiden,
+    resolutions=par["resolutions"],
)

print('Compute ARI score', flush=True)
-ari_score = ari(adata, cluster_key='cluster', label_key="cell_type")
+ari_score = ari(adata, cluster_key=cluster_key, label_key="cell_type")

print('Compute NMI score', flush=True)
-nmi_score = nmi(adata, cluster_key='cluster', label_key="cell_type")
+nmi_score = nmi(adata, cluster_key=cluster_key, label_key="cell_type")

print("Create output AnnData object", flush=True)
output = ad.AnnData(
diff --git a/src/metrics/isolated_label_f1/config.vsh.yaml b/src/metrics/isolated_label_f1/config.vsh.yaml
index 27fc9f49..f4241f8f 100644
--- a/src/metrics/isolated_label_f1/config.vsh.yaml
+++ b/src/metrics/isolated_label_f1/config.vsh.yaml
@@ -37,6 +37,12 @@ info:
    min: 0
    max: 1
    maximize: true
+arguments:
+  - name: --resolutions
+    type: double
+    multiple: true
+    default: [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
+    description: Resolutions at which to look up precomputed clusters
resources:
  - type: python_script
    path: script.py
@@ -47,7 +53,7 @@ engines:
    setup:
      - type: python
        pypi:
-          - scib==1.1.5
+          - scib==1.1.6
runners:
  - type: executable
  - type: nextflow
diff --git a/src/metrics/isolated_label_f1/script.py b/src/metrics/isolated_label_f1/script.py
index 2737f244..08d1c431 100644
--- a/src/metrics/isolated_label_f1/script.py
+++ b/src/metrics/isolated_label_f1/script.py
@@ -1,4 +1,5 @@
import sys
+import pandas as pd
import anndata as ad
from scib.metrics import isolated_labels_f1

@@ -6,6 +7,7 @@
par = {
-    'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad',
+    'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_processed.h5ad',
    'output': 'output.h5ad',
+    "resolutions": [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
}

meta = {
@@ -18,15 +20,21 @@

print('Read input', flush=True)
-adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns')
+adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', obsp='obsp', uns='uns')
adata.obs = read_anndata(par['input_solution'], obs='obs').obs
adata.uns |= read_anndata(par['input_solution'], uns='uns').uns

+# get existing clusters precomputed by process_integration, if any
+cluster_df = adata.obsm.get('clustering', pd.DataFrame(index=adata.obs_names))
+adata.obs = pd.concat([adata.obs, cluster_df], axis=1)
+
print('compute score', flush=True)
score = isolated_labels_f1(
    adata,
    label_key="cell_type",
-    batch_key='batch',
+    batch_key="batch",
+    cluster_key="leiden",
+    resolutions=par["resolutions"],
    embed=None,
    iso_threshold=None,
    verbose=True,
diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf
index fc2b142c..6e4a4020 100644
--- a/src/workflows/process_datasets/main.nf
+++ b/src/workflows/process_datasets/main.nf
@@ -38,6 +38,7 @@ workflow run_wf {
        state.dataset != null
      }

+    // process the dataset
    | process_dataset.run(
        fromState: [ input: "dataset" ],
        toState: [
diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
index 75c93003..269f5337 100644
--- a/src/workflows/run_benchmark/config.vsh.yaml
+++ b/src/workflows/run_benchmark/config.vsh.yaml
@@ -42,6 +42,13 @@ argument_groups:
        required: true
        direction: output
        default: task_info.yaml
+  - name: Clustering
+    arguments:
+      - name: "--resolutions"
+        type: double
+        multiple: true
+        default: [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
+        description: Resolution parameters to use for clustering
  - name: Method filtering
    description: |
      Use these arguments to filter methods by name. By default, all methods are
@@ -111,7 +118,8 @@ dependencies:
  - name: metrics/kbet
  - name: metrics/lisi
  - name: metrics/pcr
-  - name: data_processors/transform
+  # data processors
+  - name: data_processors/process_integration

runners:
  - type: nextflow
diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
index afcb968c..035c42c5 100644
--- a/src/workflows/run_benchmark/main.nf
+++ b/src/workflows/run_benchmark/main.nf
@@ -80,9 +80,9 @@ workflow run_wf {
    }
  )

-  /***************************
-   * RUN METHODS AND METRICS *
-   ***************************/
+  /***************
+   * RUN METHODS *
+   ***************/

  score_ch = dataset_ch

@@ -135,13 +135,19 @@
      }
    )

-    | transform.run(
+    /******************
+     * PROCESS OUTPUT *
+     ******************/
+
+    | process_integration.run(
      fromState: [
        input_integrated: "method_output",
        input_dataset: "input_dataset",
        expected_method_types: "method_types"
      ],
      toState: { id, output, state ->
+        // Add method types to the state here, because extra state can't be
+        // passed back from the processing subworkflow
        def method_types_cleaned = []
        if ("feature" in state.method_types) {
          method_types_cleaned += ["feature", "embedding", "graph"]
@@ -152,7 +158,7 @@
        }

        def new_state = state + [
-          method_output_cleaned: output.output,
+          method_output_processed: output.output,
          method_types_cleaned: method_types_cleaned
        ]

@@ -160,6 +166,10 @@
        }
      )

+    /***************
+     * RUN METRICS *
+     ***************/
+
    // run all metrics
    | runEach(
      components: metrics,
@@ -172,7 +182,7 @@
      // use 'fromState' to fetch the arguments the component requires from the overall state
      fromState: [
        input_solution: "input_solution",
-        input_integrated: "method_output_cleaned"
+        input_integrated: "method_output_processed"
      ],
      // use 'toState' to publish that component's outputs to the overall state
      toState: { id, output, state, comp ->
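A note for reviewers on how the pieces above fit together: `precompute_clustering_run` writes one `leiden_<resolution>` obs column per resolution, `precompute_clustering_merge` collects those columns into `obsm["clustering"]`, and the metric scripts copy them into `.obs` so scib can reuse them instead of re-running Leiden. A minimal sketch of that consumption pattern, assuming the test resource path created by `scripts/create_resources/test_resources.sh` above:

```python
import anndata as ad
import pandas as pd

# Processed integration output as produced by the process_integration workflow
adata = ad.read_h5ad(
    "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_processed.h5ad"
)

# obsm["clustering"] holds one "leiden_<resolution>" column per resolution,
# as written by precompute_clustering_merge; fall back to an empty frame if
# no clusterings were precomputed
clusterings = adata.obsm.get("clustering", pd.DataFrame(index=adata.obs_names))
print(clusterings.columns.tolist())  # e.g. ['leiden_0.2', ..., 'leiden_0.8']

# Metrics copy the precomputed columns into .obs so scib looks them up by
# cluster_key instead of clustering from scratch at every resolution
adata.obs = pd.concat([adata.obs, clusterings], axis=1)
```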