Merge pull request #737 from AlexsLemonade/jashapiro/h5ad-output

Change Anndata extension to h5ad
AlexsLemonade · Apr 11, 2024 · 84d5c81 · 84d5c81
2 parents 4fe5d64 + f4715c1
commit 84d5c81
Show file tree

Hide file tree

Showing 6 changed files with 51 additions and 51 deletions.
diff --git a/bin/predict_cellassign.py b/bin/predict_cellassign.py
@@ -18,8 +18,8 @@
 parser = argparse.ArgumentParser()
 parser.add_argument(
     "-i",
-    "--input_hdf5_file",
-    dest="input_hdf5_file",
+    "--anndata_file",
+    dest="anndata_file",
     required=True,
     help="Path to HDF5 file with processed AnnData object to annotate",
 )
@@ -63,14 +63,14 @@
 scvi.settings.num_threads = args.threads
 
 # compile extension regex
-file_ext = re.compile(r"\.hdf5$|.h5$", re.IGNORECASE)
+file_ext = re.compile(r"\.h5ad$|\.hdf5$|.h5$", re.IGNORECASE)
 
 # check that input file exists, if it does exist, make sure it's an h5 file
-if not os.path.exists(args.input_hdf5_file):
-    raise FileExistsError("--input_hdf5_file file not found.")
-elif not file_ext.search(args.input_hdf5_file):
+if not os.path.exists(args.anndata_file):
+    raise FileExistsError("--anndata_file file not found.")
+elif not file_ext.search(args.anndata_file):
     raise ValueError(
-        "--input_hdf5_file must end in either .hdf5 or .h5 and contain a processed AnnData object."
+        "--anndata_file must end in .h5ad, .hdf5, or .h5 and contain a processed AnnData object."
     )
 
 # check that marker file exists and make sure its a tsv
@@ -85,7 +85,7 @@
 ref_matrix = pd.read_csv(args.reference, sep="\t", index_col="ensembl_id")
 
 # file path to annotated sce
-annotated_adata = adata.read_h5ad(args.input_hdf5_file)
+annotated_adata = adata.read_h5ad(args.anndata_file)
 
 # subset anndata to contain only genes in the reference file
 # note that the gene names must be the rownames of the reference matrix
@@ -95,7 +95,7 @@
 # check that shared_genes actually has some genes
 if not shared_genes:
     raise ValueError(
-        "--reference does not include any genes found in the provided --input_hdf5_file."
+        "--reference does not include any genes found in the provided --anndata_file."
     )
 
 # create a new anndata object with only shared genes

diff --git a/bin/sce_to_anndata.R b/bin/sce_to_anndata.R
@@ -21,7 +21,7 @@ option_list <- list(
   make_option(
     opt_str = c("--output_rna_h5"),
     type = "character",
-    help = "path to output hdf5 file to store RNA counts as AnnData object. Must end in .hdf5 or .h5"
+    help = "path to output hdf5 file to store RNA counts as AnnData object. Must end in .hdf5, .h5ad, or .h5"
   ),
   make_option(
     opt_str = c("--feature_name"),

diff --git a/external-instructions.md b/external-instructions.md
@@ -461,11 +461,11 @@ All files in that folder will be prefixed by the library ID.
 
 The files with the suffixes `_unfiltered.rds`, `_filtered.rds`, and `_processed.rds` provide quantified gene expression data as [`SingleCellExperiment` objects](https://bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html).
 
-The files with the suffixes `_unfiltered_rna.hdf5`, `_filtered_rna.hdf5`, and `_processed_rna.hdf5` provide the quantified gene expression data as [`AnnData` objects](https://anndata.readthedocs.io/en/latest/).
-If the input data contains libraries with ADT tags, three additional files with the suffixes `_unfiltered_adt.hdf5`, `_filtered_adt.hdf5`, and `_processed_adt.hdf5`will be provided for each library.
+The files with the suffixes `_unfiltered_rna.h5ad`, `_filtered_rna.h5ad`, and `_processed_rna.h5ad` provide the quantified gene expression data as [`AnnData` objects](https://anndata.readthedocs.io/en/latest/).
+If the input data contains libraries with ADT tags, three additional files with the suffixes `_unfiltered_adt.h5ad`, `_filtered_adt.h5ad`, and `_processed_adt.h5ad`will be provided for each library.
 These files contain the quantified ADT tag data as an [`AnnData` object](https://anndata.readthedocs.io/en/latest/).
 
-**Note: We currently do not output `AnnData` objects (`.hdf5` files) for any multiplexed libraries.
+**Note: We currently do not output `AnnData` objects (`.h5ad` files) for any multiplexed libraries.
 Only `SingleCellExperiment` objects (`.rds` files) will be provided for multiplexed libraries.**
 
 For more information on the contents of these files, see the [ScPCA portal docs section on single cell gene expression file contents](https://scpca.readthedocs.io/en/latest/sce_file_contents.html).
@@ -478,9 +478,9 @@ results
     ├── library_id_unfiltered.rds
     ├── library_id_filtered.rds
     ├── library_id_processed.rds
-    ├── library_id_unfiltered_rna.hdf5
-    ├── library_id_filtered_rna.hdf5
-    ├── library_id_processed_rna.hdf5
+    ├── library_id_unfiltered_rna.h5ad
+    ├── library_id_filtered_rna.h5ad
+    ├── library_id_processed_rna.h5ad
     ├── library_id_metadata.json
     └── library_id_qc.html
 ```
@@ -687,15 +687,15 @@ nextflow run AlexsLemonade/scpca-nf/merge.nf \
 
 ### Output files
 
-The `merge.nf` workflow will output, for each specified project ID, an `.rds` file containing a merged `SingleCellExperiment` object, an `.hdf5` file containing a merged `AnnData` object, and a report which provides a brief summary of the types of libraries and their samples' diagnoses included in the merged object, as well as UMAP visualizations highlighting each library.
+The `merge.nf` workflow will output, for each specified project ID, an `.rds` file containing a merged `SingleCellExperiment` object, an `.h5ad` file containing a merged `AnnData` object, and a report which provides a brief summary of the types of libraries and their samples' diagnoses included in the merged object, as well as UMAP visualizations highlighting each library.
 
 These output files will follow this structure:
 
 ```
 merged
 └── <project_id>
     ├── <project_id>_merged.rds
-    ├── <project_id>_merged_rna.hdf5
+    ├── <project_id>_merged_rna.h5ad
     └── <project_id>_merged-summary-report.html
 ```
 
@@ -708,15 +708,15 @@ There are some additional considerations to be aware of for libraries which cont
 
 If any libraries in a merge group have ADT counts, these counts will also be merged and included in the final merged object.
 In the case of `SingleCellExperiment` objects, ADT counts will be provided as an alternative experiment called `"adt"` in same object.
-In the case of `AnnData`, a separate file will be exported with the extension `_adt.hdf5` that contains the merged ADT counts.
+In the case of `AnnData`, a separate file will be exported with the extension `_adt.h5ad` that contains the merged ADT counts.
 The output files will follow this structure if CITE-seq data is present:
 
 ```
 merged
 └── <project_id>
     ├── <project_id>_merged.rds
-    ├── <project_id>_merged_rna.hdf5
-    ├── <project_id>_merged_adt.hdf5
+    ├── <project_id>_merged_rna.h5ad
+    ├── <project_id>_merged_adt.h5ad
     └── <project_id>_merged-summary-report.html
 ```
 

diff --git a/merge.nf b/merge.nf
@@ -92,28 +92,28 @@ process export_anndata {
     input:
       tuple path(merged_sce_file), val(merge_group_id), val(has_adt)
     output:
-      tuple val(merge_group_id), path("${merge_group_id}_merged_*.hdf5")
+      tuple val(merge_group_id), path("${merge_group_id}_merged_*.h5ad")
     script:
-      rna_hdf5_file = "${merge_group_id}_merged_rna.hdf5"
-      feature_hdf5_file = "${merge_group_id}_merged_adt.hdf5"
+      rna_h5ad_file = "${merge_group_id}_merged_rna.h5ad"
+      feature_h5ad_file = "${merge_group_id}_merged_adt.h5ad"
       """
       sce_to_anndata.R \
         --input_sce_file ${merged_sce_file} \
-        --output_rna_h5 ${rna_hdf5_file} \
-        --output_feature_h5 ${feature_hdf5_file} \
+        --output_rna_h5 ${rna_h5ad_file} \
+        --output_feature_h5 ${feature_h5ad_file} \
         --is_merged \
         ${has_adt ? "--feature_name adt" : ''}
 
       # move normalized counts to X in AnnData
-      move_counts_anndata.py --anndata_file ${rna_hdf5_file}
-      ${has_adt ? "move_counts_anndata.py --anndata_file ${feature_hdf5_file}" : ''}
+      move_counts_anndata.py --anndata_file ${rna_h5ad_file}
+      ${has_adt ? "move_counts_anndata.py --anndata_file ${feature_h5ad_file}" : ''}
       """
     stub:
-      rna_hdf5_file = "${merge_group_id}_merged_rna.hdf5"
-      feature_hdf5_file = "${merge_group_id}_merged_adt.hdf5"
+      rna_h5ad_file = "${merge_group_id}_merged_rna.h5ad"
+      feature_h5ad_file = "${merge_group_id}_merged_adt.h5ad"
       """
-      touch ${rna_hdf5_file}
-      ${has_adt ? "touch ${feature_hdf5_file}" : ''}
+      touch ${rna_h5ad_file}
+      ${has_adt ? "touch ${feature_h5ad_file}" : ''}
       """
 }
 

diff --git a/modules/classify-celltypes.nf b/modules/classify-celltypes.nf
@@ -67,11 +67,11 @@ process classify_cellassign {
     # Convert SCE to AnnData
     sce_to_anndata.R \
         --input_sce_file "${processed_rds}" \
-        --output_rna_h5 "processed.hdf5"
+        --output_rna_h5 "processed.h5ad"
 
     # Run CellAssign
     predict_cellassign.py \
-      --input_hdf5_file "processed.hdf5" \
+      --anndata_file "processed.h5ad" \
       --output_predictions "${cellassign_dir}/cellassign_predictions.tsv" \
       --reference "${cellassign_reference_file}" \
       --seed ${params.seed} \

diff --git a/modules/export-anndata.nf b/modules/export-anndata.nf
@@ -8,36 +8,36 @@ process export_anndata {
     input:
       tuple val(meta), path(sce_file), val(file_type)
     output:
-      tuple val(meta), path("${meta.library_id}_${file_type}_*.hdf5"), val(file_type)
+      tuple val(meta), path("${meta.library_id}_${file_type}_*.h5ad"), val(file_type)
     script:
-      rna_hdf5_file = "${meta.library_id}_${file_type}_rna.hdf5"
-      feature_hdf5_file = "${meta.library_id}_${file_type}_${meta.feature_type}.hdf5"
+      rna_h5ad_file = "${meta.library_id}_${file_type}_rna.h5ad"
+      feature_h5ad_file = "${meta.library_id}_${file_type}_${meta.feature_type}.h5ad"
       feature_present = meta.feature_type in ["adt"]
       """
       sce_to_anndata.R \
         --input_sce_file ${sce_file} \
-        --output_rna_h5 ${rna_hdf5_file} \
-        --output_feature_h5 ${feature_hdf5_file} \
+        --output_rna_h5 ${rna_h5ad_file} \
+        --output_feature_h5 ${feature_h5ad_file} \
         ${feature_present ? "--feature_name ${meta.feature_type}" : ''} \
         ${file_type != "processed" ? "--compress_output" : ''}
 
       # move any normalized counts to X in AnnData
       if [ "${file_type}" = "processed" ]; then
-        move_counts_anndata.py --anndata_file ${rna_hdf5_file}
+        move_counts_anndata.py --anndata_file ${rna_h5ad_file}
         # move counts in feature data, if the file exists
-        if [ -f "${feature_hdf5_file}" ]; then
-          move_counts_anndata.py --anndata_file ${feature_hdf5_file}
+        if [ -f "${feature_h5ad_file}" ]; then
+          move_counts_anndata.py --anndata_file ${feature_h5ad_file}
         fi
       fi
 
       """
     stub:
-      rna_hdf5_file = "${meta.library_id}_${file_type}_rna.hdf5"
-      feature_hdf5_file = "${meta.library_id}_${file_type}_${meta.feature_type}.hdf5"
+      rna_h5ad_file = "${meta.library_id}_${file_type}_rna.h5ad"
+      feature_h5ad_file = "${meta.library_id}_${file_type}_${meta.feature_type}.h5ad"
       feature_present = meta.feature_type in ["adt"]
       """
-      touch ${rna_hdf5_file}
-      ${feature_present ? "touch ${feature_hdf5_file}" : ''}
+      touch ${rna_h5ad_file}
+      ${feature_present ? "touch ${feature_h5ad_file}" : ''}
       """
 }
 
@@ -69,16 +69,16 @@ workflow sce_to_anndata {
 
       // combine all anndata files by library id
       anndata_ch = export_anndata.out
-        .map{ meta, hdf5_files, file_type -> tuple(
+        .map{ meta, h5ad_files, file_type -> tuple(
           meta.library_id, // pull out library id for grouping
           meta,
-          hdf5_files // either rna.hdf5 or [ rna.hdf5, feature.hdf5 ]
+          h5ad_files // either rna.h5ad or [ rna.h5ad, feature.h5ad ]
         )}
         // group by library id result is
-        // [library id, [meta, meta, meta], [hdf5 files]]
+        // [library id, [meta, meta, meta], [h5ad files]]
         .groupTuple(by: 0, size: 3, remainder: true)
-        // pull out just 1 meta object and hdf5 files
-        // [meta, [hdf5 files]]
+        // pull out just 1 meta object and h5ad files
+        // [meta, [h5ad files]]
         .map{ [it[1][0]] +  it[2] }
 
     emit: anndata_ch