Precompute clustering #18

Merged: 22 commits, merged on Jan 10, 2025

Commits
026e765
add clustering data frame to the solution
rcannood Dec 10, 2024
9392ab6
update script
rcannood Dec 10, 2024
013b54c
add comments
rcannood Dec 10, 2024
b075b3c
add clustering key prefix for cluster-based metrics
mumichae Dec 13, 2024
0f99a8d
add resolutions parameters to metrics to make use of precomputed clus…
mumichae Dec 13, 2024
a4404ca
fix clustering key for nmi and ari
mumichae Dec 13, 2024
54c0fd9
set correct version of scib to make using precomputed clusters possible
mumichae Dec 13, 2024
f80d939
add resolutions argument to cluster-based metrics
mumichae Dec 20, 2024
5234d3c
use igraph for clustering on CPU
mumichae Dec 20, 2024
17d436c
use partial reading for clustering
mumichae Dec 20, 2024
e77ad55
rename cluster keys to be consistent with scib metrics
mumichae Dec 20, 2024
810507f
fix import and reading missing slot
mumichae Dec 20, 2024
391e4b2
get clustering from obsm
mumichae Dec 20, 2024
4b95c18
Add config to create test resources script
lazappi Jan 8, 2025
6c56070
Add clustering to benchmark workflow
lazappi Jan 8, 2025
f3dc116
Remove clustering from process dataset workflow
lazappi Jan 8, 2025
b285dbe
Move output processing to subworkflow
lazappi Jan 9, 2025
81f9649
Update API with processing subworkflow
lazappi Jan 9, 2025
3ff4797
Re-enable all methods/metrics
lazappi Jan 9, 2025
5ee87fd
Remove clustering from fil_solution.yaml API file
lazappi Jan 9, 2025
19b6e52
Add processing to test resources script
lazappi Jan 9, 2025
38552af
update readme
rcannood Jan 10, 2025
26 changes: 14 additions & 12 deletions README.md
@@ -66,25 +66,25 @@ flowchart TB
file_solution("<a href='https://github.com/openproblems-bio/task_batch_integration#file-format-solution'>Solution</a>")
comp_control_method[/"<a href='https://github.com/openproblems-bio/task_batch_integration#component-type-control-method'>Control method</a>"/]
comp_method[/"<a href='https://github.com/openproblems-bio/task_batch_integration#component-type-method'>Method</a>"/]
comp_transformer[/"<a href='https://github.com/openproblems-bio/task_batch_integration#component-type-transform'>Transform</a>"/]
comp_process_integration[/"<a href='https://github.com/openproblems-bio/task_batch_integration#component-type-process-integration'>Process integration</a>"/]
comp_metric[/"<a href='https://github.com/openproblems-bio/task_batch_integration#component-type-metric'>Metric</a>"/]
file_integrated("<a href='https://github.com/openproblems-bio/task_batch_integration#file-format-integration'>Integration</a>")
file_integrated_full("<a href='https://github.com/openproblems-bio/task_batch_integration#file-format-transformed-integration'>Transformed integration</a>")
file_integrated_processed("<a href='https://github.com/openproblems-bio/task_batch_integration#file-format-processed-integration-output'>Processed integration output</a>")
file_score("<a href='https://github.com/openproblems-bio/task_batch_integration#file-format-score'>Score</a>")
file_common_dataset---comp_process_dataset
comp_process_dataset-->file_dataset
comp_process_dataset-->file_solution
file_dataset---comp_control_method
file_dataset---comp_method
file_dataset---comp_transformer
file_dataset---comp_process_integration
file_solution---comp_control_method
file_solution---comp_metric
comp_control_method-->file_integrated
comp_method-->file_integrated
comp_transformer-->file_integrated_full
comp_process_integration-->file_integrated_processed
comp_metric-->file_score
file_integrated---comp_transformer
file_integrated_full---comp_metric
file_integrated---comp_process_integration
file_integrated_processed---comp_metric
```

## File format: Common Dataset
@@ -276,18 +276,19 @@ Arguments:

</div>

## Component type: Transform
## Component type: Process integration

Check the output and transform to create additional output types
Process output from an integration method to the format expected by metrics

Arguments:

<div class="small">

| Name | Type | Description |
|:---|:---|:---|
| `--input_dataset` | `file` | Unintegrated AnnData HDF5 file. |
| `--input_integrated` | `file` | An integrated AnnData dataset. |
| `--expected_method_types` | `string` | NA. |
@@ -356,12 +357,12 @@ Data structure:

</div>

## File format: Transformed integration
## File format: Processed integration output

An integrated AnnData dataset with additional outputs.

Example file:
`resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad`
`resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_processed.h5ad`

Description:

@@ -379,7 +380,7 @@ Format:
<div class="small">

AnnData object
obsm: 'X_emb'
obsm: 'X_emb', 'clustering'
obsp: 'connectivities', 'distances'
layers: 'corrected_counts'
uns: 'dataset_id', 'normalization_id', 'dataset_organism', 'method_id', 'neighbors'
@@ -393,6 +394,7 @@ Data structure:
| Slot | Type | Description |
|:---|:---|:---|
| `obsm["X_emb"]` | `double` | (*Optional*) Embedding output - 2D coordinate matrix. |
| `obsm["clustering"]` | `integer` | Leiden clustering results at different resolutions. |
| `obsp["connectivities"]` | `double` | Graph output - neighbor connectivities matrix. |
| `obsp["distances"]` | `double` | Graph output - neighbor distances matrix. |
| `layers["corrected_counts"]` | `double` | (*Optional*) Feature output - corrected counts. |
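The `obsm["clustering"]` slot added above is a data frame with one `leiden_<resolution>` column per precomputed resolution. A minimal sketch of how a cluster-based metric could consume it (the paths, the `leiden_2.0` resolution, and the `cell_type` label column are illustrative, not taken from this diff):

```python
# Illustrative only: read a precomputed Leiden clustering at one resolution
# from the processed integration output and score it against the solution.
import anndata as ad
from sklearn.metrics import normalized_mutual_info_score

adata = ad.read_h5ad("integrated_processed.h5ad")  # hypothetical path
solution = ad.read_h5ad("solution.h5ad")           # hypothetical path

clusters = adata.obsm["clustering"]["leiden_2.0"]  # one column per resolution
labels = solution.obs["cell_type"]                 # assumed label column

print(f"NMI: {normalized_mutual_info_score(labels, clusters):.3f}")
```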
29 changes: 19 additions & 10 deletions scripts/create_resources/test_resources.sh
@@ -14,26 +14,35 @@ DATASET_DIR=resources_test/task_batch_integration
mkdir -p $DATASET_DIR

# process dataset
viash run src/data_processors/process_dataset/config.vsh.yaml -- \
nextflow run . \
-main-script target/nextflow/workflows/process_datasets/main.nf \
-profile docker \
--input "$RAW_DATA/cxg_immune_cell_atlas/dataset.h5ad" \
--output_dataset "$DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad" \
--output_solution "$DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad"
--publish_dir "$DATASET_DIR/cxg_immune_cell_atlas" \
--output_dataset dataset.h5ad \
--output_solution solution.h5ad \
--output_state state.yaml \
-c common/nextflow_helpers/labels_ci.config \

# run one method
viash run src/methods/combat/config.vsh.yaml -- \
--input $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \
--output $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad

# run transformer
viash run src/data_processors/transform/config.vsh.yaml -- \
--input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad \
--input_dataset $DATASET_DIR/cxg_immune_cell_atlas/dataset.h5ad \
--expected_method_types feature \
--output $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad
# process integration
nextflow run . \
-main-script target/nextflow/data_processors/process_integration/main.nf \
-profile docker \
--input_dataset "$RAW_DATA/cxg_immune_cell_atlas/dataset.h5ad" \
--input_integrated "$DATASET_DIR/cxg_immune_cell_atlas/integrated.h5ad" \
--expected_method_types feature \
--publish_dir "$DATASET_DIR/cxg_immune_cell_atlas" \
--output integrated_processed.h5ad \
-c common/nextflow_helpers/labels_ci.config \

# run one metric
viash run src/metrics/graph_connectivity/config.vsh.yaml -- \
--input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated_full.h5ad \
--input_integrated $DATASET_DIR/cxg_immune_cell_atlas/integrated_processed.h5ad \
--input_solution $DATASET_DIR/cxg_immune_cell_atlas/solution.h5ad \
--output $DATASET_DIR/cxg_immune_cell_atlas/score.h5ad

2 changes: 1 addition & 1 deletion src/api/comp_metric.yaml
@@ -8,7 +8,7 @@ info:
A metric for evaluating batch integration methods.
arguments:
- name: --input_integrated
__merge__: file_integrated_full.yaml
__merge__: file_integrated_processed.yaml
direction: input
required: true
- name: --input_solution
45 changes: 45 additions & 0 deletions src/api/comp_process_integration.yaml
@@ -0,0 +1,45 @@
namespace: data_processors
info:
type: process_integration
type_info:
label: Process integration
summary: Process output from an integration method to the format expected by metrics
description: |
This component will:

- Perform transformations of the integration output
- Cluster the integrated data at different resolutions

argument_groups:
- name: Inputs
arguments:
- name: "--input_dataset"
__merge__: /src/api/file_dataset.yaml
type: file
direction: input
required: true
- name: "--input_integrated"
__merge__: /src/api/file_integrated.yaml
type: file
direction: input
required: true
- name: --expected_method_types
type: string
direction: input
required: true
multiple: true
description: |
The expected output types of the batch integration method.
choices: [ feature, embedding, graph ]
- name: Outputs
arguments:
- name: "--output"
__merge__: file_integrated_processed.yaml
direction: output
required: true

test_resources:
- type: python_script
path: /common/component_tests/run_and_check_output.py
- path: /resources_test/task_batch_integration/cxg_immune_cell_atlas
dest: resources_test/task_batch_integration/cxg_immune_cell_atlas
39 changes: 0 additions & 39 deletions src/api/comp_transformer.yaml

This file was deleted.

src/api/file_integrated_full.yaml → src/api/file_integrated_processed.yaml (renamed)
@@ -1,6 +1,6 @@
type: file
example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad"
label: Transformed integration
example: "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_processed.h5ad"
label: Processed integration output
summary: An integrated AnnData dataset with additional outputs.
description: |
Must contain at least one of:
@@ -23,6 +23,10 @@ info:
name: X_emb
description: Embedding output - 2D coordinate matrix
required: false
- type: integer
name: clustering
description: Leiden clustering results at different resolutions.
required: true
obsp:
- type: double
name: connectivities
30 changes: 30 additions & 0 deletions src/data_processors/precompute_clustering_merge/config.vsh.yaml
@@ -0,0 +1,30 @@
name: precompute_clustering_merge
namespace: data_processors
label: Merge clustering precomputations
summary: Merge the precompute results of clustering on the input dataset
arguments:
- name: --input
type: file
direction: input
required: true
- name: --output
type: file
direction: output
required: true
- name: --clusterings
type: file
description: Clustering results to merge
direction: input
required: true
multiple: true
resources:
- type: python_script
path: script.py
engines:
- type: docker
image: openproblems/base_python:1.0.0
runners:
- type: executable
- type: nextflow
directives:
label: [midtime, midmem, lowcpu]
28 changes: 28 additions & 0 deletions src/data_processors/precompute_clustering_merge/script.py
@@ -0,0 +1,28 @@
import anndata as ad
import pandas as pd

## VIASH START
par = {
"input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
"clusterings": ["output.h5ad", "output2.h5ad"],
"output": "output3.h5ad",
}
## VIASH END

print("Read clusterings", flush=True)
clusterings = []
for clus_file in par["clusterings"]:
adata = ad.read_h5ad(clus_file)
obs_filt = adata.obs.filter(regex='leiden_[0-9.]+')
clusterings.append(obs_filt)

print("Merge clusterings", flush=True)
merged = pd.concat(clusterings, axis=1)

print("Read input", flush=True)
input = ad.read_h5ad(par["input"])

input.obsm["clustering"] = merged

print("Store outputs", flush=True)
input.write_h5ad(par["output"], compression="gzip")
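A note on the merge step above: `pd.concat(clusterings, axis=1)` aligns the per-resolution columns on the shared cell index rather than on row order, so each clustering file is assumed to carry the same `obs` index as the input dataset. A hedged guard along these lines (not part of the PR), placed just before the `obsm` assignment, would make a mismatch fail fast:

```python
# Hypothetical guard (not in the PR): fail fast if any clustering was
# computed on a different set of cells than the input dataset.
assert all(df.index.equals(input.obs.index) for df in clusterings), \
    "clustering obs index does not match the input dataset"
```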
35 changes: 35 additions & 0 deletions src/data_processors/precompute_clustering_run/config.vsh.yaml
@@ -0,0 +1,35 @@
name: precompute_clustering_run
namespace: data_processors
label: Run clustering precomputations
summary: Run clustering on the input dataset
arguments:
- name: --input
__merge__: /src/api/file_common_dataset.yaml
direction: input
required: true
- name: --output
__merge__: /src/api/file_dataset.yaml
direction: output
required: true
- type: double
name: resolution
default: 0.8
description: Resolution parameter for clustering
resources:
- type: python_script
path: script.py
- path: /src/utils/read_anndata_partial.py
engines:
- type: docker
image: openproblems/base_python:1.0.0
setup:
- type: python
pypi:
- scanpy
- igraph
- leidenalg
runners:
- type: executable
- type: nextflow
directives:
label: [midtime, midmem, lowcpu]
50 changes: 50 additions & 0 deletions src/data_processors/precompute_clustering_run/script.py
@@ -0,0 +1,50 @@
import sys
import anndata as ad

# check if we can use GPU
USE_GPU = False
try:
import subprocess
assert subprocess.run('nvidia-smi', shell=True, stdout=subprocess.DEVNULL).returncode == 0
from rapids_singlecell.tl import leiden
USE_GPU = True
except Exception:
    # no GPU available, fall back to scanpy's CPU implementation
    from scanpy.tl import leiden

## VIASH START
par = {
"input": "resources_test/task_batch_integration/cxg_immune_cell_atlas/dataset.h5ad",
"output": "output.h5ad",
"resolution": 0.8,
}
## VIASH END

sys.path.append(meta["resources_dir"])
from read_anndata_partial import read_anndata

n_cell_cpu = 300_000

print("Read input", flush=True)
input = read_anndata(par["input"], obs='obs', obsp='obsp', uns='uns')

key = f'leiden_{par["resolution"]}'
kwargs = dict()
if not USE_GPU:
kwargs |= dict(
flavor='igraph',
n_iterations=2,
)

print(f"Run Leiden clustering with {kwargs}", flush=True)
leiden(
input,
resolution=par["resolution"],
key_added=key,
**kwargs,
)

print("Store outputs", flush=True)
output = ad.AnnData(
obs=input.obs[[key]],
)
output.write_h5ad(par["output"], compression="gzip")
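Taken together, the two components implement a scatter/gather pattern: `precompute_clustering_run` is launched once per resolution and writes only the new `leiden_<resolution>` column, and `precompute_clustering_merge` gathers those columns into `obsm["clustering"]`. A minimal in-process sketch of that flow (the resolution grid and path are illustrative; the benchmark drives each resolution as a separate Nextflow job):

```python
# Illustrative end-to-end sketch of run + merge in a single process,
# assuming the dataset already has a neighbors graph in .obsp/.uns.
import anndata as ad
import pandas as pd
import scanpy as sc

adata = ad.read_h5ad("dataset.h5ad")  # hypothetical path

clusterings = []
for resolution in [0.8, 1.0, 2.0]:  # hypothetical resolution grid
    key = f"leiden_{resolution}"
    sc.tl.leiden(adata, resolution=resolution, key_added=key,
                 flavor="igraph", n_iterations=2)
    clusterings.append(adata.obs[[key]])

# Gather step, mirroring precompute_clustering_merge/script.py:
# one column per resolution, aligned on the obs index.
adata.obsm["clustering"] = pd.concat(clusterings, axis=1)
```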