diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh index 7222cc85..58ac28a1 100755 --- a/scripts/create_resources/resources.sh +++ b/scripts/create_resources/resources.sh @@ -6,19 +6,11 @@ REPO_ROOT=$(git rev-parse --show-toplevel) # ensure that the command below is run from the root of the repository cd "$REPO_ROOT" -# remove this when you have implemented the script -echo "TODO: once the 'process_datasets' workflow is implemented, update this script to use it." -echo " Step 1: replace 'task_batch_integration' with the name of the task in the following command." -echo " Step 2: replace the rename keys parameters to fit your process_dataset inputs" -echo " Step 3: replace the settings parameter to fit your process_dataset outputs" -echo " Step 4: remove this message" -exit 1 - cat > /tmp/params.yaml << 'HERE' input_states: s3://openproblems-data/resources/datasets/**/state.yaml rename_keys: 'input:output_dataset' output_state: '$id/state.yaml' -settings: '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad"}' +settings: '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad"}' publish_dir: s3://openproblems-data/resources/task_batch_integration/datasets/ HERE diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh index 82e31a82..d0fba746 100755 --- a/scripts/run_benchmark/run_test_local.sh +++ b/scripts/run_benchmark/run_test_local.sh @@ -17,7 +17,7 @@ publish_dir="resources/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE -input_states: s3://openproblems-data/resources_test/task_batch_integration/**/state.yaml +input_states: resources_test/task_batch_integration/**/state.yaml rename_keys: 'input_dataset:output_dataset;input_solution:output_solution' output_state: "state.yaml" publish_dir: "$publish_dir" diff --git a/src/control_methods/embed_cell_types/config.vsh.yaml b/src/control_methods/embed_cell_types/config.vsh.yaml index 76fcdb78..4d7f27e8 100644 --- a/src/control_methods/embed_cell_types/config.vsh.yaml +++ b/src/control_methods/embed_cell_types/config.vsh.yaml @@ -3,8 +3,8 @@ name: embed_cell_types label: Embed cell types summary: Cells are embedded as a one-hot encoding of celltype labels description: Cells are embedded as a one-hot encoding of celltype labels - info: + method_types: [embedding] preferred_normalization: log_cp10k resources: - type: python_script diff --git a/src/control_methods/embed_cell_types_jittered/config.vsh.yaml b/src/control_methods/embed_cell_types_jittered/config.vsh.yaml index 46e6c14f..e5b6329f 100644 --- a/src/control_methods/embed_cell_types_jittered/config.vsh.yaml +++ b/src/control_methods/embed_cell_types_jittered/config.vsh.yaml @@ -5,8 +5,8 @@ summary: Cells are embedded as a one-hot encoding of celltype labels, with a sma amount of random noise added to the embedding description: Cells are embedded as a one-hot encoding of celltype labels, with a small amount of random noise added to the embedding - info: + method_types: [embedding] preferred_normalization: log_cp10k arguments: - name: --jitter diff --git a/src/control_methods/no_integration/config.vsh.yaml b/src/control_methods/no_integration/config.vsh.yaml index 251022d3..93995297 100644 --- a/src/control_methods/no_integration/config.vsh.yaml +++ b/src/control_methods/no_integration/config.vsh.yaml @@ -3,8 +3,8 @@ name: no_integration label: No integration summary: Original feature space is not modified description: Original feature space is not modified - info: + method_types: [embedding] preferred_normalization: log_cp10k resources: - type: python_script diff --git a/src/control_methods/no_integration_batch/config.vsh.yaml b/src/control_methods/no_integration_batch/config.vsh.yaml index 1a74c043..f954acc9 100644 --- a/src/control_methods/no_integration_batch/config.vsh.yaml +++ b/src/control_methods/no_integration_batch/config.vsh.yaml @@ -3,8 +3,8 @@ name: no_integration_batch label: No integration by Batch summary: Cells are embedded by computing PCA independently on each batch description: Cells are embedded by computing PCA independently on each batch - info: + method_types: [embedding] preferred_normalization: log_cp10k resources: - type: python_script diff --git a/src/control_methods/shuffle_integration/config.vsh.yaml b/src/control_methods/shuffle_integration/config.vsh.yaml index 7ecd6f8f..5469f331 100644 --- a/src/control_methods/shuffle_integration/config.vsh.yaml +++ b/src/control_methods/shuffle_integration/config.vsh.yaml @@ -4,6 +4,7 @@ label: Shuffle integration summary: Integrations are randomly permuted description: Integrations are randomly permuted info: + method_types: [feature] preferred_normalization: log_cp10k resources: - type: python_script diff --git a/src/control_methods/shuffle_integration_by_batch/config.vsh.yaml b/src/control_methods/shuffle_integration_by_batch/config.vsh.yaml index d5208008..7af22c2e 100644 --- a/src/control_methods/shuffle_integration_by_batch/config.vsh.yaml +++ b/src/control_methods/shuffle_integration_by_batch/config.vsh.yaml @@ -4,6 +4,7 @@ label: Shuffle integration by batch summary: Integrations are randomly permuted within each batch description: Integrations are randomly permuted within each batch info: + method_types: [feature] preferred_normalization: log_cp10k resources: - type: python_script diff --git a/src/control_methods/shuffle_integration_by_cell_type/config.vsh.yaml b/src/control_methods/shuffle_integration_by_cell_type/config.vsh.yaml index 9a274e92..2f7052ae 100644 --- a/src/control_methods/shuffle_integration_by_cell_type/config.vsh.yaml +++ b/src/control_methods/shuffle_integration_by_cell_type/config.vsh.yaml @@ -4,6 +4,7 @@ label: Shuffle integration by cell type summary: Integrations are randomly permuted within each cell type description: Integrations are randomly permuted within each cell type info: + method_types: [feature] preferred_normalization: log_cp10k resources: - type: python_script diff --git a/src/data_processors/transform/script.py b/src/data_processors/transform/script.py index b92bae64..dc01584a 100644 --- a/src/data_processors/transform/script.py +++ b/src/data_processors/transform/script.py @@ -19,7 +19,6 @@ print("Checking shapes", flush=True) assert integrated.shape[0] == dataset.shape[0], "Number of cells do not match" -assert integrated.shape[1] == dataset.shape[1], "Number of genes do not match" print("Checking index", flush=True) if not integrated.obs.index.equals(dataset.obs.index): @@ -27,11 +26,13 @@ print("Reordering cells", flush=True) integrated = integrated[dataset.obs.index] -if "corrected_counts" in integrated.layers.keys() and \ - not integrated.var.index.equals(dataset.var.index): - assert integrated.var.index.sort_values().equals(dataset.var.index.sort_values()), "Gene names do not match" - print("Reordering genes", flush=True) - integrated = integrated[:, dataset.var.index] +if "corrected_counts" in integrated.layers.keys(): + assert integrated.shape[1] == dataset.shape[1], "Number of genes do not match" + + if not integrated.var.index.equals(dataset.var.index): + assert integrated.var.index.sort_values().equals(dataset.var.index.sort_values()), "Gene names do not match" + print("Reordering genes", flush=True) + integrated = integrated[:, dataset.var.index] print("Checking method output based on type", flush=True) if "feature" in par["expected_method_types"]: diff --git a/src/methods/fastmnn/script.R b/src/methods/fastmnn/script.R index 9a569cb1..a6a30f7d 100644 --- a/src/methods/fastmnn/script.R +++ b/src/methods/fastmnn/script.R @@ -35,6 +35,8 @@ output <- anndata::AnnData( layers = list( corrected_counts = t(layer) ), + obs = adata$obs[, c()], + var = adata$var[, c()], obsm = list( X_emb = obsm ), diff --git a/src/methods/liger/script.R b/src/methods/liger/script.R index 07665e44..62dec598 100644 --- a/src/methods/liger/script.R +++ b/src/methods/liger/script.R @@ -98,6 +98,7 @@ output <- anndata::AnnData( normalization_id = adata$uns[["normalization_id"]], method_id = meta$name ), + obs = adata$obs[, c()], obsm = list( X_emb = lobj@H.norm[rownames(adata), , drop = FALSE] ), diff --git a/src/methods/mnn_correct/script.R b/src/methods/mnn_correct/script.R index 6c36c2a6..1780ca4d 100644 --- a/src/methods/mnn_correct/script.R +++ b/src/methods/mnn_correct/script.R @@ -34,6 +34,8 @@ output <- anndata::AnnData( normalization_id = adata$uns[["normalization_id"]], method_id = meta$name ), + obs = adata$obs[, c()], + var = adata$var[, c()], layers = list( corrected_counts = as(t(layer), "sparseMatrix") ), diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index 35100060..999bc8da 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -95,7 +95,7 @@ workflow run_wf { // use 'fromState' to fetch the arguments the component requires from the overall state fromState: { id, state, comp -> - def new_args = [] + def new_args = [:] if (comp.config.info.type == "method") { new_args.input = state.input_dataset } else if (comp.config.info.type == "control_method") { @@ -116,8 +116,12 @@ workflow run_wf { ) | transform.run( - fromState: [input: "method_output"], - toState: { id, state, output -> + fromState: [ + input_integrated: "method_output", + input_dataset: "input_dataset", + expected_method_types: "method_types" + ], + toState: { id, output, state -> def method_types_cleaned = [] if ("feature" in state.method_types) { method_types_cleaned += ["feature", "embedding", "graph"] @@ -132,7 +136,7 @@ workflow run_wf { method_types_cleaned: method_types_cleaned ] - [id, new_state] + new_state } ) @@ -143,7 +147,7 @@ workflow run_wf { id + "." + comp.config.name }, filter: { id, state, comp -> - comp.info.metric_type in state.method_types_cleaned + comp.config.info.metric_type in state.method_types_cleaned }, // use 'fromState' to fetch the arguments the component requires from the overall state fromState: [