From 9dcaf38eb30a9fc0375e4836d7b169451710a8f7 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Thu, 11 Apr 2024 14:02:45 -0400
Subject: [PATCH 1/3] Update code to use h5ad

changed argument to predict_cellassign to make things conistent
---
 bin/predict_cellassign.py     | 18 +++++++++---------
 bin/sce_to_anndata.R          |  2 +-
 merge.nf                      | 10 +++++-----
 modules/classify-celltypes.nf |  4 ++--
 modules/export-anndata.nf     | 34 +++++++++++++++++-----------------
 5 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/bin/predict_cellassign.py b/bin/predict_cellassign.py
index 98905b84..22fc6d32 100755
--- a/bin/predict_cellassign.py
+++ b/bin/predict_cellassign.py
@@ -18,8 +18,8 @@
 parser = argparse.ArgumentParser()
 parser.add_argument(
     "-i",
-    "--input_hdf5_file",
-    dest="input_hdf5_file",
+    "--anndata_file",
+    dest="anndata_file",
     required=True,
     help="Path to HDF5 file with processed AnnData object to annotate",
 )
@@ -63,14 +63,14 @@
 scvi.settings.num_threads = args.threads
 
 # compile extension regex
-file_ext = re.compile(r"\.hdf5$|.h5$", re.IGNORECASE)
+file_ext = re.compile(r"\.h5ad$|\.hdf5$|.h5$", re.IGNORECASE)
 
 # check that input file exists, if it does exist, make sure it's an h5 file
-if not os.path.exists(args.input_hdf5_file):
-    raise FileExistsError("--input_hdf5_file file not found.")
-elif not file_ext.search(args.input_hdf5_file):
+if not os.path.exists(args.anndata_file):
+    raise FileExistsError("--anndata_file file not found.")
+elif not file_ext.search(args.anndata_file):
     raise ValueError(
-        "--input_hdf5_file must end in either .hdf5 or .h5 and contain a processed AnnData object."
+        "--anndata_file must end in .h5ad, .hdf5, or .h5 and contain a processed AnnData object."
     )
 
 # check that marker file exists and make sure its a tsv
@@ -85,7 +85,7 @@
 ref_matrix = pd.read_csv(args.reference, sep="\t", index_col="ensembl_id")
 
 # file path to annotated sce
-annotated_adata = adata.read_h5ad(args.input_hdf5_file)
+annotated_adata = adata.read_h5ad(args.anndata_file)
 
 # subset anndata to contain only genes in the reference file
 # note that the gene names must be the rownames of the reference matrix
@@ -95,7 +95,7 @@
 # check that shared_genes actually has some genes
 if not shared_genes:
     raise ValueError(
-        "--reference does not include any genes found in the provided --input_hdf5_file."
+        "--reference does not include any genes found in the provided --anndata_file."
     )
 
 # create a new anndata object with only shared genes
diff --git a/bin/sce_to_anndata.R b/bin/sce_to_anndata.R
index 98098d0c..7e8570a6 100755
--- a/bin/sce_to_anndata.R
+++ b/bin/sce_to_anndata.R
@@ -21,7 +21,7 @@ option_list <- list(
   make_option(
     opt_str = c("--output_rna_h5"),
     type = "character",
-    help = "path to output hdf5 file to store RNA counts as AnnData object. Must end in .hdf5 or .h5"
+    help = "path to output hdf5 file to store RNA counts as AnnData object. Must end in .hdf5, .h5ad, or .h5"
   ),
   make_option(
     opt_str = c("--feature_name"),
diff --git a/merge.nf b/merge.nf
index aebbaebb..8a6abcdb 100644
--- a/merge.nf
+++ b/merge.nf
@@ -92,10 +92,10 @@ process export_anndata {
     input:
       tuple path(merged_sce_file), val(merge_group_id), val(has_adt)
     output:
-      tuple val(merge_group_id), path("${merge_group_id}_merged_*.hdf5")
+      tuple val(merge_group_id), path("${merge_group_id}_merged_*.h5ad")
     script:
-      rna_hdf5_file = "${merge_group_id}_merged_rna.hdf5"
-      feature_hdf5_file = "${merge_group_id}_merged_adt.hdf5"
+      rna_hdf5_file = "${merge_group_id}_merged_rna.h5ad"
+      feature_hdf5_file = "${merge_group_id}_merged_adt.h5ad"
       """
       sce_to_anndata.R \
         --input_sce_file ${merged_sce_file} \
@@ -109,8 +109,8 @@ process export_anndata {
       ${has_adt ? "move_counts_anndata.py --anndata_file ${feature_hdf5_file}" : ''}
       """
     stub:
-      rna_hdf5_file = "${merge_group_id}_merged_rna.hdf5"
-      feature_hdf5_file = "${merge_group_id}_merged_adt.hdf5"
+      rna_hdf5_file = "${merge_group_id}_merged_rna.h5ad"
+      feature_hdf5_file = "${merge_group_id}_merged_adt.h5ad"
       """
       touch ${rna_hdf5_file}
       ${has_adt ? "touch ${feature_hdf5_file}" : ''}
diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf
index 67846a73..fec08a3a 100644
--- a/modules/classify-celltypes.nf
+++ b/modules/classify-celltypes.nf
@@ -67,11 +67,11 @@ process classify_cellassign {
     # Convert SCE to AnnData
     sce_to_anndata.R \
         --input_sce_file "${processed_rds}" \
-        --output_rna_h5 "processed.hdf5"
+        --output_rna_h5 "processed.h5ad"
 
     # Run CellAssign
     predict_cellassign.py \
-      --input_hdf5_file "processed.hdf5" \
+      --anndata_file "processed.h5ad" \
       --output_predictions "${cellassign_dir}/cellassign_predictions.tsv" \
       --reference "${cellassign_reference_file}" \
       --seed ${params.seed} \
diff --git a/modules/export-anndata.nf b/modules/export-anndata.nf
index 30f73b48..e4585389 100644
--- a/modules/export-anndata.nf
+++ b/modules/export-anndata.nf
@@ -8,36 +8,36 @@ process export_anndata {
     input:
       tuple val(meta), path(sce_file), val(file_type)
     output:
-      tuple val(meta), path("${meta.library_id}_${file_type}_*.hdf5"), val(file_type)
+      tuple val(meta), path("${meta.library_id}_${file_type}_*.h5ad"), val(file_type)
     script:
-      rna_hdf5_file = "${meta.library_id}_${file_type}_rna.hdf5"
-      feature_hdf5_file = "${meta.library_id}_${file_type}_${meta.feature_type}.hdf5"
+      rna_h5ad_file = "${meta.library_id}_${file_type}_rna.h5ad"
+      feature_h5ad_file = "${meta.library_id}_${file_type}_${meta.feature_type}.h5ad"
       feature_present = meta.feature_type in ["adt"]
       """
       sce_to_anndata.R \
         --input_sce_file ${sce_file} \
-        --output_rna_h5 ${rna_hdf5_file} \
-        --output_feature_h5 ${feature_hdf5_file} \
+        --output_rna_h5 ${rna_h5ad_file} \
+        --output_feature_h5 ${feature_h5ad_file} \
         ${feature_present ? "--feature_name ${meta.feature_type}" : ''} \
         ${file_type != "processed" ? "--compress_output" : ''}
 
       # move any normalized counts to X in AnnData
       if [ "${file_type}" = "processed" ]; then
-        move_counts_anndata.py --anndata_file ${rna_hdf5_file}
+        move_counts_anndata.py --anndata_file ${rna_h5ad_file}
         # move counts in feature data, if the file exists
-        if [ -f "${feature_hdf5_file}" ]; then
-          move_counts_anndata.py --anndata_file ${feature_hdf5_file}
+        if [ -f "${feature_h5ad_file}" ]; then
+          move_counts_anndata.py --anndata_file ${feature_h5ad_file}
         fi
       fi
 
       """
     stub:
-      rna_hdf5_file = "${meta.library_id}_${file_type}_rna.hdf5"
-      feature_hdf5_file = "${meta.library_id}_${file_type}_${meta.feature_type}.hdf5"
+      rna_h5ad_file = "${meta.library_id}_${file_type}_rna.h5ad"
+      feature_h5ad_file = "${meta.library_id}_${file_type}_${meta.feature_type}.h5ad"
       feature_present = meta.feature_type in ["adt"]
       """
-      touch ${rna_hdf5_file}
-      ${feature_present ? "touch ${feature_hdf5_file}" : ''}
+      touch ${rna_h5ad_file}
+      ${feature_present ? "touch ${feature_h5ad_file}" : ''}
       """
 }
 
@@ -69,16 +69,16 @@ workflow sce_to_anndata {
 
       // combine all anndata files by library id
       anndata_ch = export_anndata.out
-        .map{ meta, hdf5_files, file_type -> tuple(
+        .map{ meta, h5ad_files, file_type -> tuple(
           meta.library_id, // pull out library id for grouping
           meta,
-          hdf5_files // either rna.hdf5 or [ rna.hdf5, feature.hdf5 ]
+          h5ad_files // either rna.h5ad or [ rna.h5ad, feature.h5ad ]
         )}
         // group by library id result is
-        // [library id, [meta, meta, meta], [hdf5 files]]
+        // [library id, [meta, meta, meta], [h5ad files]]
         .groupTuple(by: 0, size: 3, remainder: true)
-        // pull out just 1 meta object and hdf5 files
-        // [meta, [hdf5 files]]
+        // pull out just 1 meta object and h5ad files
+        // [meta, [h5ad files]]
         .map{ [it[1][0]] +  it[2] }
 
     emit: anndata_ch

From 4b0be73c7d03721c6023e163ffcfe17672888e6e Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Thu, 11 Apr 2024 14:03:15 -0400
Subject: [PATCH 2/3] Change filenames in docs

---
 external-instructions.md | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/external-instructions.md b/external-instructions.md
index ba4c7fcb..c81bcf6c 100644
--- a/external-instructions.md
+++ b/external-instructions.md
@@ -461,11 +461,11 @@ All files in that folder will be prefixed by the library ID.
 
 The files with the suffixes `_unfiltered.rds`, `_filtered.rds`, and `_processed.rds` provide quantified gene expression data as [`SingleCellExperiment` objects](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html).
 
-The files with the suffixes `_unfiltered_rna.hdf5`, `_filtered_rna.hdf5`, and `_processed_rna.hdf5` provide the quantified gene expression data as [`AnnData` objects](https://anndata.readthedocs.io/en/latest/).
-If the input data contains libraries with ADT tags, three additional files with the suffixes `_unfiltered_adt.hdf5`, `_filtered_adt.hdf5`, and `_processed_adt.hdf5`will be provided for each library.
+The files with the suffixes `_unfiltered_rna.h5ad`, `_filtered_rna.h5ad`, and `_processed_rna.h5ad` provide the quantified gene expression data as [`AnnData` objects](https://anndata.readthedocs.io/en/latest/).
+If the input data contains libraries with ADT tags, three additional files with the suffixes `_unfiltered_adt.h5ad`, `_filtered_adt.h5ad`, and `_processed_adt.h5ad`will be provided for each library.
 These files contain the quantified ADT tag data as an [`AnnData` object](https://anndata.readthedocs.io/en/latest/).
 
-**Note: We currently do not output `AnnData` objects (`.hdf5` files) for any multiplexed libraries.
+**Note: We currently do not output `AnnData` objects (`.h5ad` files) for any multiplexed libraries.
 Only `SingleCellExperiment` objects (`.rds` files) will be provided for multiplexed libraries.**
 
 For more information on the contents of these files, see the [ScPCA portal docs section on single cell gene expression file contents](https://scpca.readthedocs.io/en/latest/sce_file_contents.html).
@@ -478,9 +478,9 @@ results
     ├── library_id_unfiltered.rds
     ├── library_id_filtered.rds
     ├── library_id_processed.rds
-    ├── library_id_unfiltered_rna.hdf5
-    ├── library_id_filtered_rna.hdf5
-    ├── library_id_processed_rna.hdf5
+    ├── library_id_unfiltered_rna.h5ad
+    ├── library_id_filtered_rna.h5ad
+    ├── library_id_processed_rna.h5ad
     ├── library_id_metadata.json
     └── library_id_qc.html
 ```
@@ -687,7 +687,7 @@ nextflow run AlexsLemonade/scpca-nf/merge.nf \
 
 ### Output files
 
-The `merge.nf` workflow will output, for each specified project ID, an `.rds` file containing a merged `SingleCellExperiment` object, an `.hdf5` file containing a merged `AnnData` object, and a report which provides a brief summary of the types of libraries and their samples' diagnoses included in the merged object, as well as UMAP visualizations highlighting each library.
+The `merge.nf` workflow will output, for each specified project ID, an `.rds` file containing a merged `SingleCellExperiment` object, an `.h5ad` file containing a merged `AnnData` object, and a report which provides a brief summary of the types of libraries and their samples' diagnoses included in the merged object, as well as UMAP visualizations highlighting each library.
 
 These output files will follow this structure:
 
@@ -695,7 +695,7 @@ These output files will follow this structure:
 merged
 └── <project_id>
     ├── <project_id>_merged.rds
-    ├── <project_id>_merged_rna.hdf5
+    ├── <project_id>_merged_rna.h5ad
     └── <project_id>_merged-summary-report.html
 ```
 
@@ -708,15 +708,15 @@ There are some additional considerations to be aware of for libraries which cont
 
 If any libraries in a merge group have ADT counts, these counts will also be merged and included in the final merged object.
 In the case of `SingleCellExperiment` objects, ADT counts will be provided as an alternative experiment called `"adt"` in same object.
-In the case of `AnnData`, a separate file will be exported with the extension `_adt.hdf5` that contains the merged ADT counts.
+In the case of `AnnData`, a separate file will be exported with the extension `_adt.h5ad` that contains the merged ADT counts.
 The output files will follow this structure if CITE-seq data is present:
 
 ```
 merged
 └── <project_id>
     ├── <project_id>_merged.rds
-    ├── <project_id>_merged_rna.hdf5
-    ├── <project_id>_merged_adt.hdf5
+    ├── <project_id>_merged_rna.h5ad
+    ├── <project_id>_merged_adt.h5ad
     └── <project_id>_merged-summary-report.html
 ```
 

From f4715c12f9d63944b9df610fb8d727babed7f4e4 Mon Sep 17 00:00:00 2001
From: Joshua Shapiro <josh.shapiro@ccdatalab.org>
Date: Thu, 11 Apr 2024 14:05:27 -0400
Subject: [PATCH 3/3] change variable names too

---
 merge.nf | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/merge.nf b/merge.nf
index 8a6abcdb..83d45f84 100644
--- a/merge.nf
+++ b/merge.nf
@@ -94,26 +94,26 @@ process export_anndata {
     output:
       tuple val(merge_group_id), path("${merge_group_id}_merged_*.h5ad")
     script:
-      rna_hdf5_file = "${merge_group_id}_merged_rna.h5ad"
-      feature_hdf5_file = "${merge_group_id}_merged_adt.h5ad"
+      rna_h5ad_file = "${merge_group_id}_merged_rna.h5ad"
+      feature_h5ad_file = "${merge_group_id}_merged_adt.h5ad"
       """
       sce_to_anndata.R \
         --input_sce_file ${merged_sce_file} \
-        --output_rna_h5 ${rna_hdf5_file} \
-        --output_feature_h5 ${feature_hdf5_file} \
+        --output_rna_h5 ${rna_h5ad_file} \
+        --output_feature_h5 ${feature_h5ad_file} \
         --is_merged \
         ${has_adt ? "--feature_name adt" : ''}
 
       # move normalized counts to X in AnnData
-      move_counts_anndata.py --anndata_file ${rna_hdf5_file}
-      ${has_adt ? "move_counts_anndata.py --anndata_file ${feature_hdf5_file}" : ''}
+      move_counts_anndata.py --anndata_file ${rna_h5ad_file}
+      ${has_adt ? "move_counts_anndata.py --anndata_file ${feature_h5ad_file}" : ''}
       """
     stub:
-      rna_hdf5_file = "${merge_group_id}_merged_rna.h5ad"
-      feature_hdf5_file = "${merge_group_id}_merged_adt.h5ad"
+      rna_h5ad_file = "${merge_group_id}_merged_rna.h5ad"
+      feature_h5ad_file = "${merge_group_id}_merged_adt.h5ad"
       """
-      touch ${rna_hdf5_file}
-      ${has_adt ? "touch ${feature_hdf5_file}" : ''}
+      touch ${rna_h5ad_file}
+      ${has_adt ? "touch ${feature_h5ad_file}" : ''}
       """
 }