diff --git a/README.md b/README.md index 3a460e2b..50c16e34 100644 --- a/README.md +++ b/README.md @@ -91,8 +91,7 @@ flowchart TB A subset of the common dataset. -Example file: -`resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad` +Example file: `resources_test/common/cxg_immune_cell_atlas/dataset.h5ad` Format: @@ -158,7 +157,7 @@ Arguments: Unintegrated AnnData HDF5 file. Example file: -`resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad` +`resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad` Format: @@ -202,7 +201,7 @@ Data structure: Uncensored dataset containing the true labels. Example file: -`resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad` +`resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad` Format: @@ -317,7 +316,7 @@ Arguments: An integrated AnnData dataset. Example file: -`resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated.h5ad` +`resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated.h5ad` Description: @@ -362,7 +361,7 @@ Data structure: An integrated AnnData dataset with additional outputs. Example file: -`resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad` +`resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad` Description: diff --git a/_viash.yaml b/_viash.yaml index 8a0d18ea..1598a220 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -31,21 +31,21 @@ description: | references: doi: - # Luecken, M.D., Büttner, M., Chaichoompu, K. et al. - # Benchmarking atlas-level data integration in single-cell genomics. Nat Methods 19, 41–50 (2022). + # Luecken, M.D., Büttner, M., Chaichoompu, K. et al. + # Benchmarking atlas-level data integration in single-cell genomics. Nat Methods 19, 41–50 (2022). 
- 10.1038/s41592-021-01336-8 - + info: image: thumbnail.svg test_resources: - type: s3 - path: s3://openproblems-data/resources_test/common/cxg_mouse_pancreas_atlas/ - dest: resources_test/common/cxg_mouse_pancreas_atlas + path: s3://openproblems-data/resources_test/common/cxg_immune_cell_atlas/ + dest: resources_test/common/cxg_immune_cell_atlas - type: s3 path: s3://openproblems-data/resources_test/task_batch_integration/ dest: resources_test/task_batch_integration -authors: +authors: - name: Michaela Mueller roles: [ maintainer, author ] info: diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh index 58ac28a1..66b4eefb 100755 --- a/scripts/create_resources/resources.sh +++ b/scripts/create_resources/resources.sh @@ -19,7 +19,7 @@ tw launch https://github.com/openproblems-bio/task_batch_integration.git \ --pull-latest \ --main-script target/nextflow/workflows/process_datasets/main.nf \ --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --compute-env 6UWsS5iw7TI37saKo2wcMi \ --params-file /tmp/params.yaml \ --entry-name auto \ --config common/nextflow_helpers/labels_tw.config \ diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 92694692..49d2fd93 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -15,36 +15,36 @@ mkdir -p $DATASET_DIR # process dataset viash run src/data_processors/process_dataset/config.vsh.yaml -- \ - --input "$RAW_DATA/cxg_mouse_pancreas_atlas/dataset.h5ad" \ - --output_dataset "$DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad" \ - --output_solution "$DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad" + --input "$RAW_DATA/cxg_immune_cell_atlas/dataset.h5ad" \ + --output_dataset "$DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad" \ + --output_solution "$DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad" # run one method viash run src/methods/combat/config.vsh.yaml -- \ - 
--input $DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad \ - --output $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated.h5ad + --input $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \ + --output $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad # run transformer viash run src/data_processors/transform/config.vsh.yaml -- \ - --input_integrated $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated.h5ad \ - --input_dataset $DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad \ + --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad \ + --input_dataset $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \ --expected_method_types feature \ - --output $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated_full.h5ad + --output $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad # run one metric viash run src/metrics/graph_connectivity/config.vsh.yaml -- \ - --input_integrated $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated_full.h5ad \ - --input_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \ - --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad + --input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad \ + --input_solution $DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad \ + --output $DATASET_DIR/cxg_immune_cell_atlas/score.h5ad # write the state file -cat > $DATASET_DIR/state.yaml << HERE -id: cxg_mouse_pancreas_atlas +cat > $DATASET_DIR/cxg_immune_cell_atlas/state.yaml << HERE +id: cxg_immune_cell_atlas output_dataset: !file dataset.h5ad output_solution: !file solution.h5ad output_integrated: !file integrated.h5ad output_integrated_full: !file integrated_full.h5ad output_score: !file score.h5ad HERE # only run this if you have access to the openproblems-data bucket diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 8f4bc92a..1e980239 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ 
b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -25,7 +25,7 @@ tw launch https://github.com/openproblems-bio/task_batch_integration.git \ --pull-latest \ --main-script target/nextflow/workflows/run_benchmark/main.nf \ --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --compute-env 6UWsS5iw7TI37saKo2wcMi \ --params-file /tmp/params.yaml \ --entry-name auto \ --config common/nextflow_helpers/labels_tw.config \ diff --git a/scripts/run_benchmark/run_test_seqeracloud.sh b/scripts/run_benchmark/run_test_seqeracloud.sh index 64056313..3645ad0f 100755 --- a/scripts/run_benchmark/run_test_seqeracloud.sh +++ b/scripts/run_benchmark/run_test_seqeracloud.sh @@ -21,7 +21,7 @@ tw launch https://github.com/openproblems-bio/task_batch_integration.git \ --pull-latest \ --main-script target/nextflow/workflows/run_benchmark/main.nf \ --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --compute-env 6UWsS5iw7TI37saKo2wcMi \ --params-file /tmp/params.yaml \ --entry-name auto \ --config common/nextflow_helpers/labels_tw.config \ diff --git a/src/api/base_method.yaml b/src/api/base_method.yaml new file mode 100644 index 00000000..ed3d5938 --- /dev/null +++ b/src/api/base_method.yaml @@ -0,0 +1,20 @@ +namespace: methods +info: + type: method + type_info: + label: Method + summary: A method for the batch integration task. + description: | + A batch integration method which integrates multiple datasets. 
+arguments: + - name: --input + __merge__: file_dataset.yaml + direction: input + required: true + - name: --output + __merge__: file_integrated.yaml + direction: output + required: true +test_resources: + - type: python_script + path: /common/component_tests/check_config.py diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index 0ca176f6..b8e1ebd3 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -24,5 +24,5 @@ test_resources: path: /common/component_tests/check_config.py - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/task_batch_integration/cxg_mouse_pancreas_atlas - dest: resources_test/task_batch_integration/cxg_mouse_pancreas_atlas + - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas + dest: resources_test/task_batch_integration/cxg_immune_cell_atlas diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index dda52ce0..571c9565 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -1,24 +1,6 @@ -namespace: methods -info: - type: method - type_info: - label: Method - summary: A method for the batch integration task. - description: | - A batch integration method which integrates multiple datasets. 
-arguments: - - name: --input - __merge__: file_dataset.yaml - direction: input - required: true - - name: --output - __merge__: file_integrated.yaml - direction: output - required: true +__merge__: base_method.yaml test_resources: - - type: python_script - path: /common/component_tests/check_config.py - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/task_batch_integration/cxg_mouse_pancreas_atlas - dest: resources_test/task_batch_integration/cxg_mouse_pancreas_atlas + - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas + dest: resources_test/task_batch_integration/cxg_immune_cell_atlas diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index 73eee377..bc57056a 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -24,5 +24,5 @@ test_resources: path: /common/component_tests/check_config.py - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/task_batch_integration/cxg_mouse_pancreas_atlas - dest: resources_test/task_batch_integration/cxg_mouse_pancreas_atlas + - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas + dest: resources_test/task_batch_integration/cxg_immune_cell_atlas diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_process_dataset.yaml index b2b449aa..067a5c3d 100644 --- a/src/api/comp_process_dataset.yaml +++ b/src/api/comp_process_dataset.yaml @@ -25,7 +25,7 @@ arguments: default: 2000 required: false test_resources: - - path: /resources_test/common/cxg_mouse_pancreas_atlas/ - dest: resources_test/common/cxg_mouse_pancreas_atlas/ + - path: /resources_test/common/cxg_immune_cell_atlas/ + dest: resources_test/common/cxg_immune_cell_atlas/ - type: python_script - path: /common/component_tests/run_and_check_output.py \ No newline at end of file + path: /common/component_tests/run_and_check_output.py diff --git a/src/api/comp_transformer.yaml b/src/api/comp_transformer.yaml index 
eb347298..b68a9c37 100644 --- a/src/api/comp_transformer.yaml +++ b/src/api/comp_transformer.yaml @@ -6,7 +6,7 @@ info: summary: Check the output and transform to create additional output types description: | This component will: - + - Assert whether the input dataset and integrated dataset have the same shape. - Reorder the integrated dataset to match the input dataset if needed. - Transform the corrected feature output to an embedding. @@ -26,7 +26,7 @@ arguments: required: true multiple: true description: | - The expected output types of the batch integration method. + The expected output types of the batch integration method. choices: [ feature, embedding, graph ] - name: --output __merge__: file_integrated_full.yaml @@ -35,5 +35,5 @@ arguments: test_resources: - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/task_batch_integration/cxg_mouse_pancreas_atlas - dest: resources_test/task_batch_integration/cxg_mouse_pancreas_atlas + - path: /resources_test/task_batch_integration/cxg_immune_cell_atlas + dest: resources_test/task_batch_integration/cxg_immune_cell_atlas diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml index 1399f0b2..171fdeb6 100644 --- a/src/api/file_common_dataset.yaml +++ b/src/api/file_common_dataset.yaml @@ -2,7 +2,7 @@ # `src/datasets/api/file_common_dataset.yaml`. However, some fields # such as obs.cell_type and obs.batch are now required type: file -example: "resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad" +example: "resources_test/common/cxg_immune_cell_atlas/dataset.h5ad" label: "Common Dataset" summary: A subset of the common dataset. 
info: diff --git a/src/api/file_dataset.yaml b/src/api/file_dataset.yaml index 8f60192b..a76ae203 100644 --- a/src/api/file_dataset.yaml +++ b/src/api/file_dataset.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad" +example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad" label: "Dataset" summary: Unintegrated AnnData HDF5 file. info: diff --git a/src/api/file_integrated.yaml b/src/api/file_integrated.yaml index abd6df29..7920fcd0 100644 --- a/src/api/file_integrated.yaml +++ b/src/api/file_integrated.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated.h5ad" +example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated.h5ad" label: Integration summary: An integrated AnnData dataset. description: | diff --git a/src/api/file_integrated_full.yaml b/src/api/file_integrated_full.yaml index 4d02f596..cdedb854 100644 --- a/src/api/file_integrated_full.yaml +++ b/src/api/file_integrated_full.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad" +example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad" label: Transformed integration summary: An integrated AnnData dataset with additional outputs. description: | @@ -8,7 +8,7 @@ description: | - Feature: the corrected_counts layer - Embedding: the X_emb obsm - Graph: the connectivities and distances obsp - + The Graph should always be present, but the Feature and Embedding are optional. 
info: format: diff --git a/src/api/file_solution.yaml b/src/api/file_solution.yaml index 35e0c7ea..562bfa22 100644 --- a/src/api/file_solution.yaml +++ b/src/api/file_solution.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad" +example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad" label: "Solution" summary: Uncensored dataset containing the true labels. info: diff --git a/src/control_methods/embed_cell_types/script.py b/src/control_methods/embed_cell_types/script.py index 5482d301..f6f1961b 100644 --- a/src/control_methods/embed_cell_types/script.py +++ b/src/control_methods/embed_cell_types/script.py @@ -2,11 +2,11 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', - 'input_solution': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', + 'input_solution': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad', 'output': 'output.h5ad', } -meta = { +meta = { 'functionality': 'foo', 'config': 'bar' } diff --git a/src/control_methods/embed_cell_types_jittered/script.py b/src/control_methods/embed_cell_types_jittered/script.py index 9ad3e743..06180464 100644 --- a/src/control_methods/embed_cell_types_jittered/script.py +++ b/src/control_methods/embed_cell_types_jittered/script.py @@ -4,13 +4,13 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', - 'input_solution': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', + 'input_solution': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad', 'output': 'output.h5ad', 'jitter': 0.01, } -meta = { 
+meta = { 'functionality': 'foo', 'config': 'bar' } diff --git a/src/control_methods/no_integration/script.py b/src/control_methods/no_integration/script.py index 0c1581be..df7b280d 100644 --- a/src/control_methods/no_integration/script.py +++ b/src/control_methods/no_integration/script.py @@ -2,7 +2,7 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } ## VIASH END diff --git a/src/control_methods/no_integration_batch/script.py b/src/control_methods/no_integration_batch/script.py index 8324acf9..1f62763c 100644 --- a/src/control_methods/no_integration_batch/script.py +++ b/src/control_methods/no_integration_batch/script.py @@ -5,11 +5,11 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } -meta = { +meta = { 'functionality': 'foo', 'config': 'bar' } @@ -46,4 +46,4 @@ print("Store outputs", flush=True) adata.uns['method_id'] = meta['name'] -adata.write_h5ad(par['output'], compression='gzip') \ No newline at end of file +adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/control_methods/shuffle_integration/script.py b/src/control_methods/shuffle_integration/script.py index 91a542af..e1f29318 100644 --- a/src/control_methods/shuffle_integration/script.py +++ b/src/control_methods/shuffle_integration/script.py @@ -3,10 +3,10 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } -meta = { +meta = { "resources_dir": "src/tasks/batch_integration/control_methods/" } ## VIASH END diff 
--git a/src/control_methods/shuffle_integration_by_batch/script.py b/src/control_methods/shuffle_integration_by_batch/script.py index c7d35171..a9b63edc 100644 --- a/src/control_methods/shuffle_integration_by_batch/script.py +++ b/src/control_methods/shuffle_integration_by_batch/script.py @@ -3,10 +3,10 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } -meta = { +meta = { "resources_dir": "src/tasks/batch_integration/control_methods/" } ## VIASH END diff --git a/src/control_methods/shuffle_integration_by_cell_type/script.py b/src/control_methods/shuffle_integration_by_cell_type/script.py index 762bd07b..0df2ba46 100644 --- a/src/control_methods/shuffle_integration_by_cell_type/script.py +++ b/src/control_methods/shuffle_integration_by_cell_type/script.py @@ -3,10 +3,10 @@ ## VIASH START par = { - 'input_dataset': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input_dataset': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } -meta = { +meta = { "resources_dir": "src/tasks/batch_integration/control_methods/" } ## VIASH END diff --git a/src/data_processors/transform/script.py b/src/data_processors/transform/script.py index dc01584a..226edca8 100644 --- a/src/data_processors/transform/script.py +++ b/src/data_processors/transform/script.py @@ -3,8 +3,8 @@ ## VIASH START par = { - "input_integrated": "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated.h5ad", - "input_dataset": "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad", + "input_integrated": "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated.h5ad", + "input_dataset": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", 
"expected_method_types": ["feature"], "ouput": "output.h5ad" } @@ -28,7 +28,7 @@ if "corrected_counts" in integrated.layers.keys(): assert integrated.shape[1] == dataset.shape[1], "Number of genes do not match" - + if not integrated.var.index.equals(dataset.var.index): assert integrated.var.index.sort_values().equals(dataset.var.index.sort_values()), "Gene names do not match" print("Reordering genes", flush=True) diff --git a/src/methods/batchelor_fastmnn/script.R b/src/methods/batchelor_fastmnn/script.R index 76791bea..879aad68 100644 --- a/src/methods/batchelor_fastmnn/script.R +++ b/src/methods/batchelor_fastmnn/script.R @@ -8,7 +8,7 @@ suppressPackageStartupMessages({ ## VIASH START par <- list( - input = 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + input = 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', output = 'output.h5ad' ) meta <- list( diff --git a/src/methods/batchelor_mnn_correct/script.R b/src/methods/batchelor_mnn_correct/script.R index cadbcc82..4a8802af 100644 --- a/src/methods/batchelor_mnn_correct/script.R +++ b/src/methods/batchelor_mnn_correct/script.R @@ -7,7 +7,7 @@ suppressPackageStartupMessages({ }) ## VIASH START par <- list( - input = 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + input = 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', output = 'output.h5ad' ) meta <- list( diff --git a/src/methods/bbknn/script.py b/src/methods/bbknn/script.py index 86c807ed..9c121ccb 100644 --- a/src/methods/bbknn/script.py +++ b/src/methods/bbknn/script.py @@ -5,7 +5,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', 'annoy_n_trees': 10, 'neighbors_within_batch': 3, diff --git a/src/methods/combat/script.py b/src/methods/combat/script.py index 
155c1621..ab251363 100644 --- a/src/methods/combat/script.py +++ b/src/methods/combat/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/methods/harmony/script.R b/src/methods/harmony/script.R index e5cb2c5b..595e3f19 100644 --- a/src/methods/harmony/script.R +++ b/src/methods/harmony/script.R @@ -5,7 +5,7 @@ requireNamespace("harmony", quietly = TRUE) ## VIASH START par <- list( - input = 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + input = 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', output = 'output.h5ad' ) meta <- list( diff --git a/src/methods/harmonypy/script.py b/src/methods/harmonypy/script.py index 79b32537..ec851953 100644 --- a/src/methods/harmonypy/script.py +++ b/src/methods/harmonypy/script.py @@ -5,7 +5,7 @@ ## VIASH START par = { - "input": "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad", + "input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", "output": "output.h5ad" } meta = { diff --git a/src/methods/liger/script.R b/src/methods/liger/script.R index 62dec598..e5b7e451 100644 --- a/src/methods/liger/script.R +++ b/src/methods/liger/script.R @@ -4,7 +4,7 @@ requireNamespace("rliger", quietly = TRUE) ## VIASH START par <- list( - input = "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad", + input = "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", output = "output.h5ad" ) meta <- list( diff --git a/src/methods/mnnpy/script.py b/src/methods/mnnpy/script.py index a9dfd8a8..7100da10 100644 --- a/src/methods/mnnpy/script.py +++ b/src/methods/mnnpy/script.py @@ -3,7 +3,7 @@ ## VIASH START par = { - 'input': 
'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/methods/pyliger/script.py b/src/methods/pyliger/script.py index 603b6d04..c6bd5f0e 100644 --- a/src/methods/pyliger/script.py +++ b/src/methods/pyliger/script.py @@ -5,7 +5,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad' } meta = { @@ -31,7 +31,7 @@ adata_per_batch = [] for batch in adata.obs['batch'].unique(): adb = adata[adata.obs['batch'] == batch].copy() - + # save row sum and sum of squares for further use norm_sum = np.ravel(np.sum(adb.layers["norm_data"], axis=0)) norm_sum_sq = np.ravel(np.sum(adb.layers["norm_data"].power(2), axis=0)) diff --git a/src/methods/scalex/script.py b/src/methods/scalex/script.py index 887a989d..7d09f02f 100644 --- a/src/methods/scalex/script.py +++ b/src/methods/scalex/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/methods/scanorama/script.py b/src/methods/scanorama/script.py index 8f99418c..2ddb91df 100644 --- a/src/methods/scanorama/script.py +++ b/src/methods/scanorama/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/methods/scanvi/script.py b/src/methods/scanvi/script.py index 882d7ff6..5a17d2e9 100644 --- a/src/methods/scanvi/script.py +++ b/src/methods/scanvi/script.py @@ -4,7 
+4,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', 'n_hvg': 2000, 'n_latent': 30, diff --git a/src/methods/scimilarity/config.vsh.yaml b/src/methods/scimilarity/config.vsh.yaml new file mode 100644 index 00000000..02b6527c --- /dev/null +++ b/src/methods/scimilarity/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: /src/api/base_method.yaml +name: scimilarity +label: SCimilarity +summary: SCimilarity provides a unifying representation of single cell expression profiles +description: | + SCimilarity is a unifying representation of single cell expression profiles that quantifies similarity between expression states and generalizes to represent new studies without additional training. +references: + doi: 10.1101/2023.07.18.549537 +links: + repository: https://github.com/Genentech/scimilarity + documentation: https://genentech.github.io/scimilarity/index.html +info: + method_types: [embedding] + preferred_normalization: counts +arguments: + - name: --model + type: file + description: Path to the directory containing SCimilarity models or a .zip/.tar.gz archive + required: true +resources: + - type: python_script + path: script.py + - path: /src/utils/read_anndata_partial.py +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1.0.0 + setup: + - type: python + github: Genentech/scimilarity +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/methods/scimilarity/script.py b/src/methods/scimilarity/script.py new file mode 100644 index 00000000..2da1790e --- /dev/null +++ b/src/methods/scimilarity/script.py @@ -0,0 +1,112 @@ +import os +import sys +import tempfile +import zipfile +import tarfile + +import anndata as ad +import scimilarity + +## VIASH START +par = { + "input": 
"resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad", + "output": "output.h5ad", + "model": "model_v1.1", +} +meta = { + "name": "scimilarity", "resources_dir": "src/utils", +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + +print("Read input", flush=True) +adata = read_anndata(par["input"], X="layers/counts", obs="obs", var="var", uns="uns") + +if adata.uns["dataset_organism"] != "homo_sapiens": + raise ValueError( + f"SCimilarity can only be used with human data " + f"(dataset_organism == \"{adata.uns['dataset_organism']}\")" + ) + +if os.path.isdir(par["model"]): + model_temp = None + model_dir = par["model"] +else: + model_temp = tempfile.TemporaryDirectory() + model_dir = model_temp.name + + if zipfile.is_zipfile(par["model"]): + print("Extract SCimilarity model from .zip", flush=True) + with zipfile.ZipFile(par["model"], "r") as zip_file: + zip_file.extractall(model_dir) + elif tarfile.is_tarfile(par["model"]) and par["model"].endswith(".tar.gz"): + print("Extract SCimilarity model from .tar.gz", flush=True) + with tarfile.open(par["model"], "r:gz") as tar_file: + tar_file.extractall(model_dir) + model_dir = os.path.join(model_dir, os.listdir(model_dir)[0]) + else: + raise ValueError( + "The 'model' argument should be a directory, a .zip file or a .tar.gz file" + ) + +print("Load SCimilarity model", flush=True) +scimilarity_embedding = scimilarity.cell_embedding.CellEmbedding(model_path=model_dir) +print("SCimilarity version:", scimilarity.__version__) + +print("Create input data", flush=True) +# Some of the functions modify the adata so make sure we have a copy +input = ad.AnnData(X=adata.X.copy(), layers={"counts": adata.X.copy()}) +# Set input.var_names to gene symbols +input.var_names = adata.var["feature_name"] + +print("Align datasets", flush=True) + +# Check the number of genes in the dataset and reduce the overlap threshold if +# necessary (mostly for subsampled test datasets) +gene_overlap_threshold = 5000 
+if 0.8 * input.n_vars < gene_overlap_threshold: + from warnings import warn + + warn( + f"The number of genes in the dataset ({input.n_vars}) " + f"is less than or close to {gene_overlap_threshold}. " + f"Setting gene_overlap_threshold to 0.8 * n_var ({int(0.8 * input.n_vars)})." + ) + gene_overlap_threshold = int(0.8 * input.n_vars) + +input = scimilarity.utils.align_dataset( + input, + scimilarity_embedding.gene_order, + gene_overlap_threshold=gene_overlap_threshold, +) +input = scimilarity.utils.consolidate_duplicate_symbols(input) + +print("Normalizing dataset", flush=True) +input = scimilarity.utils.lognorm_counts(input) + +print("Get cell embeddings", flush=True) +cell_embeddings = scimilarity_embedding.get_embeddings(input.X) + +print("Store outputs", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + "X_emb": cell_embeddings, + }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, +) +print(output) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") + +if model_temp is not None: + print("Cleanup model directory", flush=True) + model_temp.cleanup() diff --git a/src/methods/scvi/script.py b/src/methods/scvi/script.py index b6836b49..20f1cf32 100644 --- a/src/methods/scvi/script.py +++ b/src/methods/scvi/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad', + 'input': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad', 'output': 'output.h5ad', 'n_hvg': 2000, 'n_latent': 30, diff --git a/src/metrics/asw_batch/script.py b/src/metrics/asw_batch/script.py index d6dafcfe..4a7269da 100644 --- a/src/metrics/asw_batch/script.py +++ b/src/metrics/asw_batch/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 
'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/metrics/asw_label/script.py b/src/metrics/asw_label/script.py index 499a06f9..e307aaac 100644 --- a/src/metrics/asw_label/script.py +++ b/src/metrics/asw_label/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } diff --git a/src/metrics/cell_cycle_conservation/script.py b/src/metrics/cell_cycle_conservation/script.py index 9ad38422..b254f4f8 100644 --- a/src/metrics/cell_cycle_conservation/script.py +++ b/src/metrics/cell_cycle_conservation/script.py @@ -5,7 +5,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad' } diff --git a/src/metrics/clustering_overlap/script.py b/src/metrics/clustering_overlap/script.py index 30fe1704..2254acb0 100644 --- a/src/metrics/clustering_overlap/script.py +++ b/src/metrics/clustering_overlap/script.py @@ -6,7 +6,7 @@ ## VIASH START par = { - 'adata_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'adata_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } @@ -50,4 +50,4 @@ ) print("Write data to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/metrics/graph_connectivity/script.py 
b/src/metrics/graph_connectivity/script.py index 0c92a35a..6148884e 100644 --- a/src/metrics/graph_connectivity/script.py +++ b/src/metrics/graph_connectivity/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/metrics/hvg_overlap/script.py b/src/metrics/hvg_overlap/script.py index 8ecda9bc..b902fe08 100644 --- a/src/metrics/hvg_overlap/script.py +++ b/src/metrics/hvg_overlap/script.py @@ -4,8 +4,8 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', - 'input_solution': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', + 'input_solution': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/metrics/isolated_label_asw/script.py b/src/metrics/isolated_label_asw/script.py index 39d23568..602e8d16 100644 --- a/src/metrics/isolated_label_asw/script.py +++ b/src/metrics/isolated_label_asw/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } @@ -46,4 +46,4 @@ ) print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/isolated_label_f1/script.py b/src/metrics/isolated_label_f1/script.py index a6529adb..2737f244 100644 --- 
a/src/metrics/isolated_label_f1/script.py +++ b/src/metrics/isolated_label_f1/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } @@ -45,4 +45,4 @@ ) print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/kbet/script.py b/src/metrics/kbet/script.py index 6c74c261..89bd799e 100644 --- a/src/metrics/kbet/script.py +++ b/src/metrics/kbet/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } @@ -46,4 +46,4 @@ ) print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/lisi/script.py b/src/metrics/lisi/script.py index b50f6e62..c0c564cd 100644 --- a/src/metrics/lisi/script.py +++ b/src/metrics/lisi/script.py @@ -5,7 +5,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } meta = { diff --git a/src/metrics/pcr/script.py b/src/metrics/pcr/script.py index 265ad430..0ae18ddb 100644 --- a/src/metrics/pcr/script.py +++ b/src/metrics/pcr/script.py @@ -4,7 +4,7 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad', + 'input_integrated': 
'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', 'output': 'output.h5ad', } @@ -59,4 +59,4 @@ print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index f70f9b43..3ed43a1e 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -80,6 +80,7 @@ dependencies: - name: methods/scalex - name: methods/scanorama - name: methods/scanvi + - name: methods/scimilarity - name: methods/scvi # metrics - name: metrics/asw_batch diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index ff77ad8d..2eff6d8d 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -26,6 +26,9 @@ methods = [ scalex, scanorama, scanvi, + scimilarity.run( + args: [model: file("s3://openproblems-work/cache/scimilarity-model_v1.1.tar.gz")] + ), scvi ] @@ -55,7 +58,7 @@ workflow run_wf { ****************************/ dataset_ch = input_ch // store join id - | map{ id, state -> + | map{ id, state -> [id, state + ["_meta": [join_id: id]]] } @@ -153,7 +156,7 @@ workflow run_wf { }, // use 'fromState' to fetch the arguments the component requires from the overall state fromState: [ - input_solution: "input_solution", + input_solution: "input_solution", input_integrated: "method_output_cleaned" ], // use 'toState' to publish that component's outputs to the overall state