Merge pull request #801 from AlexsLemonade/jashapiro/filter-merge-count

Add minimum number of cells (3) for merge
AlexsLemonade · Nov 7, 2024 · ba74034 · ba74034
2 parents cc50b09 + cb51115
commit ba74034
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 6 deletions.
diff --git a/bin/merge_sces.R b/bin/merge_sces.R
@@ -132,6 +132,19 @@ read_trim_sce <- function(sce_file) {
 # get list of sces
 sce_list <- purrr::map(input_sce_files, read_trim_sce)
 
+# filter out libraries with fewer than 3 cells (causes errors with PCA)
+n_cells <- sce_list |> purrr::map_int(ncol)
+has_min_cells <- n_cells >= 3 # positional mask; robust to missing/duplicate names
+included_libs <- names(sce_list)[has_min_cells]
+lib_diff <- names(sce_list)[!has_min_cells]
+if (length(lib_diff) > 0) {
+  message(
+    "The following libraries have fewer than 3 cells and will be excluded from the merged object: ",
+    paste(lib_diff, collapse = ", ")
+  )
+}
+sce_list <- sce_list[has_min_cells]
+
 # Add cell type annotation columns where needed  -------------------------------
 
 # check for present cell type annotations

diff --git a/lib/Utils.groovy b/lib/Utils.groovy
@@ -43,6 +43,10 @@ class Utils {
    * @return A value from the metadata
    */
   static def getMetaVal(file, key) {
+    if (!file.exists()) {
+      return(null)
+    }
+
     def obj = new JsonSlurper().parse(file)
     def value = obj[key]
 

diff --git a/merge.nf b/merge.nf
@@ -199,22 +199,32 @@ workflow {
     filtered_libraries_ch.single_sample
       .map{[
         it.library_id,
-        file("${params.results_dir}/${it.project_id}/${it.sample_id}/${it.library_id}_processed.rds")
+        file("${params.results_dir}/${it.project_id}/${it.sample_id}/${it.library_id}_processed.rds"),
+        file("${params.results_dir}/${it.project_id}/${it.sample_id}/${it.library_id}_metadata.json")
       ]}
-    .filter{!(it[1].exists() && it[1].size() > 0)}
     .subscribe{
-      log.warn("Processed files do not exist for ${it[0]}. This library will not be included in the merged object.")
+      if(!(it[1].exists() && it[1].size() > 0)){
+        log.warn("Processed files do not exist for ${it[0]}. This library will not be included in the merged object.")
+      }
+      else if(!(it[2].exists() && it[2].size() > 0)){
+        log.warn("Metadata file does not exist for ${it[0]}. This library will not be included in the merged object.")
+      }
+      else if (Utils.getMetaVal(it[2], "processed_cells") < 3){
+        log.warn("Library ${it[0]} has fewer than 3 cells. This library will not be included in the merged object.")
+      }
     }
 
     grouped_libraries_ch = filtered_libraries_ch.single_sample
       // create tuple of [project id, library_id, processed_sce_file, metadata_file]
       .map{[
         it.project_id,
         it.library_id,
-        file("${params.results_dir}/${it.project_id}/${it.sample_id}/${it.library_id}_processed.rds")
+        file("${params.results_dir}/${it.project_id}/${it.sample_id}/${it.library_id}_processed.rds"),
+        file("${params.results_dir}/${it.project_id}/${it.sample_id}/${it.library_id}_metadata.json")
       ]}
-      // only include libraries that have been processed through scpca-nf and aren't empty
-      .filter{it[2].exists() && it[2].size() > 0}
+      // only include libraries that have been processed through scpca-nf and have at least 3 cells
+      .filter{it[2].exists() && it[2].size() > 0 && Utils.getMetaVal(it[3], "processed_cells") >= 3}
+      .map{it[0..2]} // remove metadata file from tuple
       // only one row per library ID, this removes all the duplicates that may be present due to CITE/hashing
       .unique()
       // group tuple by project id: [project_id, [library_id1, library_id2, ...], [sce_file1, sce_file2, ...]]