Merge pull request #501 from AlexsLemonade/allyhawkins/v0.5.5

Add in changes for v0.6.0 to main
AlexsLemonade · Oct 11, 2023 · 5f6a49e · 5f6a49e
2 parents 67e175b + 6a7301d
commit 5f6a49e
Show file tree

Hide file tree

Showing 69 changed files with 11,323 additions and 889 deletions.
diff --git a/.github/workflows/nextflow-stub-check.yaml b/.github/workflows/nextflow-stub-check.yaml
@@ -24,12 +24,13 @@ jobs:
         uses: docker://nextflow/nextflow:21.10.6
         with:
           args: nextflow -log checkpoint-run.log run main.nf -stub -profile stub -ansi-log false
-      
+
       - name: Join log files
+        if: ${{ !cancelled() }}
         run: cat stub-run.log checkpoint-run.log > nextflow-runs.log
-      
+
       - name: Upload nextflow log
-        if: ${{ always() }}
+        if: ${{ !cancelled() }}
         uses:  actions/upload-artifact@v3
         with:
           name: nextflow-log

diff --git a/.github/workflows/spell-check.yml b/.github/workflows/spell-check.yml
@@ -0,0 +1,43 @@
+
+name: Spell check R Markdown and Markdown files
+
+# Controls when the action will run.
+# Pull requests to development and main only.
+on:
+  pull_request:
+    branches:
+      - development
+      - main
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  # This workflow contains a single job called "spell-check"
+  spell-check:
+    runs-on: ubuntu-latest
+    container:
+      image: rocker/tidyverse:4.2.3
+
+    # Steps represent a sequence of tasks that will be executed as part of the job
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Install packages
+        run: Rscript --vanilla -e "install.packages(c('spelling'), repos = c(CRAN = '$CRAN'))"
+
+      - name: Run spell check
+        id: spell_check_run
+        run: |
+          results=$(Rscript --vanilla "scripts/spell-check.R")
+          echo "sp_chk_results=$results" >> $GITHUB_OUTPUT
+          cat spell_check_errors.tsv
+
+      - name: Archive spelling errors
+        uses: actions/upload-artifact@v3
+        with:
+          name: spell-check-results
+          path: spell_check_errors.tsv
+
+      # If there are too many spelling errors, this will stop the workflow
+      - name: Check spell check results - fail if too many errors
+        if: ${{ steps.spell_check_run.outputs.sp_chk_results > 0 }}
+        run: exit 1
diff --git a/.gitignore b/.gitignore
@@ -11,3 +11,6 @@ scpca-references/
 # ignore template htmls
 qc_report.html
 *_qc.html
+
+# ignore hidden `DS_Store`
+.DS_Store
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -25,7 +25,19 @@ When the changes in `development` merit a new release, a pull request will be fi
 All Nextflow processes should include a [`stub` block](https://www.nextflow.io/docs/latest/process.html#stub) with a minimal script that can be run quickly to produce files in the expected output locations.
 At this stage this is purely used to allow for testing of the main workflow logic rather than the internal logic of each process.
 
-The [`test/stub-metadata.tsv`](test/stub-metadata.tsv) file is used to define input libraries that will be used for testing.
-Any additions to the overall workflow that will allow processing of a new library type should be added into `test/stub-metadata.tsv`, along with the appropriate input files (usually empty files with the expected names) for that library type in the `test/runs/` directory.
+The [`test/stub-run-metadata.tsv`](test/stub-run-metadata.tsv) file is used to define input libraries that will be used for testing.
+Each `scpca_sample_id` value in the `stub-run-metadata.tsv` file should have a corresponding entry in [`test/stub-sample-metadata.tsv`](test/stub-sample-metadata.tsv) (which can be filled with `NA` values).
+Any additions to the overall workflow that will allow processing of a new library type should include adding new example data.
+This will involve adding rows to `test/stub-run-metadata.tsv` and `test/stub-sample-metadata.tsv`, along with the appropriate input files (usually empty files with the expected names) for each library in the `test/runs/` directory.
 If a new reference type is needed, that should be defined in the [`test/stub-refs.json`](test/stub-refs.json) file.
 
+## Code style
+
+While there is no single established style for Nextflow code, we try to keep code neat and readable.
+Line length should generally be kept under 100 characters, and indentation should be consistent.
+
+For R code, we try to follow [`tidyverse` style conventions](https://style.tidyverse.org), and encourage the use of the [`styler`](https://styler.r-lib.org/) package to ensure that code is formatted consistently.
+
+For Python code, we encourage the use of the [`black` code formatter](https://black.readthedocs.io/en/stable/) to ensure consistent formatting.
+The `black` package can be installed with `pip install black`, and can be run on a file with `black <filename>`.
+Alternatively, if you use [Visual Studio Code](https://code.visualstudio.com), you can install the [`black` extension](https://marketplace.visualstudio.com/items?itemName=ms-python.black-formatter).
diff --git a/README.md b/README.md
@@ -4,10 +4,11 @@ This repository holds a [Nextflow](https://www.nextflow.io) workflow (`scpca-nf`
 All dependencies for the workflow outside of the Nextflow workflow engine itself are handled automatically; setup generally requires only organizing the input files and configuring Nextflow for your computing environment.
 Nextflow will also handle parallelizing sample processing as allowed by your environment, minimizing total run time.
 
-The workflow processes fastq files from single-cell and single-nuclei RNA-seq samples using [alevin-fry](https://alevin-fry.readthedocs.io/en/latest/) to create gene by cell matrices as [`SingleCellExperiment` objects](https://www.bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html).
+The workflow processes fastq files from single-cell and single-nuclei RNA-seq samples using [alevin-fry](https://alevin-fry.readthedocs.io/en/latest/) to create gene by cell matrices.
+The workflow outputs gene expression data in two formats: as [`SingleCellExperiment` objects](https://www.bioconductor.org/packages/release/bioc/html/SingleCellExperiment.html) and as [`AnnData` objects](https://anndata.readthedocs.io/en/latest/).
 Reads from samples are aligned using selective alignment, to an index with transcripts corresponding to spliced cDNA and to intronic regions, denoted by alevin-fry as `splici`.
 These matrices are filtered and additional processing is performed to calculate quality control statistics, create reduced-dimension transformations, and create output reports.
-`scpca-nf` can also process CITE-seq, bulk RNA-seq, and spatial transcriptomics samples.
+`scpca-nf` can also process libraries with ADT tags (e.g., CITE-seq), multiplexed libraries (e.g., cell hashing), bulk RNA-seq, and spatial transcriptomics samples.
 
 For more information on the contents of the output files and the processing of all modalities, please see the [ScPCA Portal docs](https://scpca.readthedocs.io/en/latest/).
 
@@ -25,8 +26,9 @@ For all other users, `scpca-nf` can be set up for your computing environment wit
 To run `scpca-nf` on your own samples, you will need to complete the following steps:
 
 1. [Organize your files](external-instructions.md#file-organization) so that each folder contains fastq files relevant to a single sequencing run.
-2. [Prepare a metadata file](external-instructions.md#prepare-the-metadata-file) with one row per library containing all information needed to process your samples.
-3. Set up a [configuration file](external-instructions.md#configuration-files), including the [definition of a profile](external-instructions.md#setting-up-a-profile-in-the-configuration-file), dictating where nextflow should execute the workflow.
+2. [Prepare a run metadata file](external-instructions.md#prepare-the-run-metadata-file) with one row per library containing all information needed to process your samples.
+3. [Prepare a sample metadata file](external-instructions.md#prepare-the-sample-metadata-file) with one row per sample containing any relevant metadata about each sample (e.g., diagnosis, age, sex, cell line).
+4. Set up a [configuration file](external-instructions.md#configuration-files), including the [definition of a profile](external-instructions.md#setting-up-a-profile-in-the-configuration-file), dictating where nextflow should execute the workflow.
 
 You may also [test your configuration file using example data](examples/README.md).
 

diff --git a/add-celltypes.nf b/add-celltypes.nf
@@ -11,8 +11,8 @@ if (!file(params.run_metafile).exists()) {
   param_error = true
 }
 
-if (!file(params.celltype_refs_metafile).exists()) {
-  log.error("The 'celltype_refs_metafile' file '${params.celltype_refs_metafile}' can not be found.")
+if (!file(params.celltype_project_metafile).exists()) {
+  log.error("The 'celltype_project_metafile' file '${params.celltype_project_metafile}' can not be found.")
   param_error = true
 }
 
@@ -52,10 +52,12 @@ workflow {
              || (it.submitter == params.project)
              || (it.project_id == params.project)
             }
-    // tuple of meta, processed rds file to use as input to cell type annotation
-    .map{meta -> tuple(meta,
-                       file("${params.results_dir}/${meta.project_id}/${meta.sample_id}/${meta.library_id}_processed.rds")
-                       )}
+      // tuple of meta, processed RDS file, processed hdf5 file to use as input to classifying celltypes
+      .map{meta -> tuple(meta,
+                        file("${params.results_dir}/${meta.project_id}/${meta.sample_id}/${meta.library_id}_processed.rds"),
+                        file("${params.results_dir}/${meta.project_id}/${meta.sample_id}/${meta.library_id}_processed_rna.hdf5")
+                        )}
 
     annotate_celltypes(processed_sce_ch)
+
 }
diff --git a/bin/add_demux_sce.R b/bin/add_demux_sce.R
@@ -58,30 +58,30 @@ opt <- parse_args(OptionParser(option_list = option_list))
 
 
 # check that unfiltered file exists
-if(!file.exists(opt$sce_file)){
+if (!file.exists(opt$sce_file)) {
   stop("Can't find input SCE file")
 }
 
 # check that output file name ends in .rds
-if(!(stringr::str_ends(opt$output_sce_file, ".rds"))){
+if (!(stringr::str_ends(opt$output_sce_file, ".rds"))) {
   stop("output file name must end in .rds")
 }
 
 # check for donor_ids file in vireo_dir
-if(!is.null(opt$vireo_dir)){
+if (!is.null(opt$vireo_dir)) {
   vireo_file <- file.path(opt$vireo_dir, "donor_ids.tsv")
-  if(!file.exists(vireo_file)){
+  if (!file.exists(vireo_file)) {
     stop("Can't find donor_ids.tsv file in vireo directory")
   }
 }
 
-if(is.null(opt$cellhash_pool_file)){
+if (is.null(opt$cellhash_pool_file)) {
   cellhash_df <- NULL
 } else {
-  if(!file.exists(opt$cellhash_pool_file)){
+  if (!file.exists(opt$cellhash_pool_file)) {
     stop("Can't find cellhash_pool_file")
   }
-  if(is.null(opt$library_id)){
+  if (is.null(opt$library_id)) {
     stop("Must specify library_id with cellhash_pool_file")
   }
   cellhash_df <- readr::read_tsv(opt$cellhash_pool_file) |>
@@ -93,35 +93,35 @@ if(is.null(opt$cellhash_pool_file)){
 sce <- readRDS(opt$sce_file)
 
 # check for cellhash altExp if we will use it
-if( opt$hash_demux || opt$seurat_demux ){
-  if(!"cellhash" %in% altExpNames(sce)){
+if (opt$hash_demux || opt$seurat_demux) {
+  if (!"cellhash" %in% altExpNames(sce)) {
     stop("Can't process cellhash demulitplexing without a 'cellhash' altExp")
   }
   # add cellhash sample data to SCE if present
-  if(!is.null(cellhash_df)){
+  if (!is.null(cellhash_df)) {
     sce <- scpcaTools::add_cellhash_ids(sce, cellhash_df, remove_unlabeled = TRUE)
   }
 }
 
 cellhash_ids <- rownames(altExp(sce))
 
 # only perform demultiplexing if more than one HTO is detected
-if(length(cellhash_ids) > 1) {
+if (length(cellhash_ids) > 1) {
   # add HashedDrops results
-  if(opt$hash_demux){
+  if (opt$hash_demux) {
     sce <- scpcaTools::add_demux_hashedDrops(sce)
   }
-  
+
   # add Seurat results
-  if(opt$seurat_demux){
+  if (opt$seurat_demux) {
     sce <- scpcaTools::add_demux_seurat(sce)
   }
-  
+
   # add vireo results
-  if(!is.null(opt$vireo_dir)){
+  if (!is.null(opt$vireo_dir)) {
     vireo_table <- readr::read_tsv(vireo_file)
     sce <- scpcaTools::add_demux_vireo(sce, vireo_table)
-  } 
+  }
 }
 
 # write filtered sce to output

diff --git a/bin/add_submitter_annotations.R b/bin/add_submitter_annotations.R
@@ -0,0 +1,80 @@
+#!/usr/bin/env Rscript
+
+# This script adds submitter annotations, if provided, to an SCE object's colData.
+#
+# The SCE file given by `--sce_file` is read, updated, and written back to the
+# same path. Submitter annotations are read from a TSV file, filtered to the
+# library given by `--library_id`, and stored in the colData column
+# `submitter_celltype_annotation`. Cells that are not present in the submitter
+# table will have `NA` in that column.
+
+# import libraries
+suppressPackageStartupMessages({
+  library(optparse)
+  library(SingleCellExperiment)
+})
+# set up arguments
+option_list <- list(
+  make_option(
+    opt_str = c("-f", "--sce_file"),
+    type = "character",
+    help = "path to SingleCellExperiment file to update. Must end in .rds"
+  ),
+  make_option(
+    opt_str = c("--library_id"),
+    type = "character",
+    help = "library id"
+  ),
+  make_option(
+    opt_str = c("--submitter_cell_types_file"),
+    type = "character",
+    help = "path to tsv file containing submitter-provided cell type annotations"
+  )
+)
+
+opt <- parse_args(OptionParser(option_list = option_list))
+
+# check that all arguments were provided; an option that is not supplied is
+# NULL, which would otherwise fail below with an uninformative error
+if (is.null(opt$sce_file)) {
+  stop("Must specify an SCE file with --sce_file")
+}
+if (is.null(opt$library_id)) {
+  stop("Must specify a library id with --library_id")
+}
+if (is.null(opt$submitter_cell_types_file)) {
+  stop("Must specify a submitter cell types file with --submitter_cell_types_file")
+}
+
+# check that the SCE file name (used for both input and output) ends in .rds
+if (!(stringr::str_ends(opt$sce_file, ".rds"))) {
+  stop("SingleCellExperiment file name must end in .rds")
+}
+
+# check that the SCE file exists
+if (!file.exists(opt$sce_file)) {
+  stop("Can't find input SCE file")
+}
+
+# check that submitter cell types file exists
+if (!file.exists(opt$submitter_cell_types_file)) {
+  stop("submitter cell type annotations file not found.")
+}
+
+
+# Read in sce
+sce <- readr::read_rds(opt$sce_file)
+
+# Read in celltypes
+submitter_cell_types_df <- readr::read_tsv(opt$submitter_cell_types_file) |>
+  # filter to relevant information
+  dplyr::filter(scpca_library_id == opt$library_id) |>
+  dplyr::select(
+    barcodes = cell_barcode,
+    submitter_celltype_annotation = cell_type_assignment
+  ) |>
+  # in the event of NA values, change to "Unclassified cell"
+  tidyr::replace_na(
+    list(submitter_celltype_annotation = "Unclassified cell")
+  )
+
+# Join annotations onto the existing colData.
+# A left join starting from colData keeps the rows in the same order as the
+# cells in the SCE object, which matters because colData is assigned back to
+# the object by position. The join key is given explicitly so that only
+# `barcodes` is used for matching.
+coldata_df <- colData(sce) |>
+  as.data.frame() |>
+  dplyr::left_join(submitter_cell_types_df, by = "barcodes")
+
+# Check rows before sending back into the SCE object;
+# duplicated barcodes in the submitter table would create extra rows
+if (nrow(coldata_df) != ncol(sce)) {
+  stop("Could not add submitter annotations to SCE object. There should only be one annotation per cell.")
+}
+
+# Replace colData, making sure we keep rownames
+colData(sce) <- DataFrame(
+  coldata_df,
+  row.names = coldata_df$barcodes
+)
+
+# Indicate that we have submitter celltypes in metadata,
+#  saving in same spot as for actual celltyping
+metadata(sce)$celltype_methods <- c(metadata(sce)$celltype_methods, "submitter")
+
+# Write SCE back to file
+readr::write_rds(sce, opt$sce_file, compress = "gz")