AlexsLemonade · allyhawkins · Dec 11, 2023 · Dec 7, 2023 · Dec 7, 2023 · Dec 7, 2023
diff --git a/bin/merge_sces.R b/bin/merge_sces.R
@@ -28,6 +28,12 @@ option_list <- list(
     help = "number of high variance genes to use for dimension reduction;
             the default is n_hvg = 2000"
   ),
+  make_option(
+    opt_str = "--include_alt_exp",
+    default = FALSE,
+    action = "store_true", # flag takes no value
+    help = "Keep any altExp present in the merged object."
+  ),
   make_option(
     opt_str = c("-t", "--threads"),
     type = "integer",
@@ -108,7 +114,8 @@ merged_sce <- scpcaTools::merge_sce_list(
   sce_list,
   batch_column = "library_id",
   preserve_rowdata_cols = "gene_symbol",
-  cell_id_column = "cell_id"
+  cell_id_column = "cell_id",
+  include_alt_exp = opt$include_alt_exp # NOTE(review): assumes parsed options are stored in `opt` — confirm
 )
 
 

diff --git a/merge.nf b/merge.nf
@@ -31,21 +31,22 @@ if(param_error){
 process merge_sce {
   container params.SCPCATOOLS_CONTAINER
   label 'mem_16'
-  publishDir "${params.checkpoints_dir}/merged"
+  publishDir "${params.results_dir}/merged/${merge_group_id}" // project_id is no longer an input of this process
   input:
-    tuple val(project_id), val(library_ids), path(scpca_nf_file)
+    tuple val(merge_group_id), val(has_adt), val(library_ids), path(scpca_nf_file) // has_adt toggles --include_alt_exp below
   output:
-    tuple val(project_id), path(merged_sce_file)
+    tuple val(merge_group_id), val(has_adt), path(merged_sce_file)
   script:
     input_library_ids = library_ids.join(',')
     input_sces = scpca_nf_file.join(',')
-    merged_sce_file = "${project_id}_merged.rds"
+    merged_sce_file = "${merge_group_id}_merged.rds"
     """
     merge_sces.R \
       --input_library_ids "${input_library_ids}" \
       --input_sce_files "${input_sces}" \
       --output_sce_file "${merged_sce_file}" \
       --n_hvg ${params.num_hvg} \
+      ${has_adt ? "--include_alt_exp" : ''} \
       --threads ${task.cpus}
     """
   stub:
@@ -62,23 +63,23 @@ process merge_report {
   publishDir "${params.results_dir}/merged/${merge_group}"
   label 'mem_16'
   input:
-    tuple val(merge_group), path(merged_sce_file)
+    tuple val(merge_group_id), path(merged_sce_file)
     path(report_template)
   output:
     path(merge_report)
   script:
-    merge_report = "${merge_group}_summary_report.html"
+    merge_report = "${merge_group_id}_summary_report.html"
     """
     Rscript -e "rmarkdown::render( \
       '${report_template}', \
       output_file = '${merge_report}', \
-      params = list(merge_group = '${merge_group}', \
+      params = list(merge_group = '${merge_group_id}', \
                     merged_sce = '${merged_sce_file}', \
                     batch_column = 'library_id') \
       )"
     """
   stub:
-    merge_report = "${merge_group}_summary_report.html"
+    merge_report = "${merge_group_id}_summary_report.html"
     """
     touch ${merge_report}
     """
@@ -89,12 +90,20 @@ workflow {
     // grab project ids to run
     project_ids = params.project?.tokenize(',') ?: []
 
-    // read in run metafile, filter to projects of interest, and group by project
-    grouped_libraries_ch = Channel.fromPath(params.run_metafile)
+    // read in run metafile and filter to projects of interest
+    libraries_ch = Channel.fromPath(params.run_metafile)
       .splitCsv(header: true, sep: '\t')
       // filter to only include specified project ids
       .filter{it.scpca_project_id in project_ids}
-      // only include single-cell/single-nuclei which already contain processed altexps, and ensure we don't try to merge libraries from spatial or bulk data
+
+    // get all projects that contain at least one library with CITEseq
+    adt_projects = libraries_ch
+      .filter{it.technology.startsWith('CITEseq')}
+      .map{it.scpca_project_id}
+      .unique().toList() // single list of distinct project ids; toList emits [] when there is no input
+
+    grouped_libraries_ch = libraries_ch
+      // only include single-cell/single-nuclei which ensures we don't try to merge libraries from spatial or bulk data
       .filter{it.seq_unit in ['cell', 'nucleus']}
       // create tuple of [project id, library_id, processed_sce_file]
       .map{[
@@ -104,11 +113,17 @@ workflow {
       ]}
       // only include libraries that have been processed through scpca-nf
       .filter{file(it[2]).exists()}
-      // make sure we don't have any duplicates of the same library ID hanging around
-      // this shouldn't be the case since we removed CITE-seq and cell-hashing
+      // only one row per library ID, this removes all the duplicates that may be present due to CITE/hashing
       .unique()
       // group tuple by project id: [project_id, [library_id1, library_id2, ...], [sce_file1, sce_file2, ...]]
       .groupTuple(by: 0)
+      // add in boolean for if project contains samples with adt
+      .map{project_id, library_id_list, sce_file_list -> tuple(
+        project_id,
+        project_id in adt_projects.getVal(), // adt_projects is a channel, so test against the list it holds; determines if altExp should be included in the merged object
+        library_id_list,
+        sce_file_list
+      )}
 
     merge_sce(grouped_libraries_ch)