diff --git a/bin/merge_sces.R b/bin/merge_sces.R
index a436f6b3..acd7ffc0 100755
--- a/bin/merge_sces.R
+++ b/bin/merge_sces.R
@@ -28,6 +28,12 @@ option_list <- list(
     help = "number of high variance genes to use for dimension reduction;
       the default is n_hvg = 2000"
   ),
+  make_option(
+    opt_str = c("--include_alt_exp"),
+    action = "store_true",
+    default = FALSE,
+    help = "Keep any altExp present in the merged object."
+  ),
   make_option(
     opt_str = c("-t", "--threads"),
     type = "integer",
@@ -108,7 +114,9 @@ merged_sce <- scpcaTools::merge_sce_list(
   sce_list,
   batch_column = "library_id",
   preserve_rowdata_cols = "gene_symbol",
-  cell_id_column = "cell_id"
+  cell_id_column = "cell_id",
+  # NOTE(review): assumes the parsed options object is named `opt` -- confirm
+  include_alt_exp = opt$include_alt_exp
 )
 
 
diff --git a/merge.nf b/merge.nf
index 6b7024c1..7b66f897 100644
--- a/merge.nf
+++ b/merge.nf
@@ -31,21 +31,22 @@ if(param_error){
 process merge_sce {
   container params.SCPCATOOLS_CONTAINER
   label 'mem_16'
-  publishDir "${params.checkpoints_dir}/merged"
+  publishDir "${params.results_dir}/merged/${merge_group_id}"
   input:
-    tuple val(project_id), val(library_ids), path(scpca_nf_file)
+    tuple val(merge_group_id), val(has_adt), val(library_ids), path(scpca_nf_file)
   output:
-    tuple val(project_id), path(merged_sce_file)
+    tuple val(merge_group_id), val(has_adt), path(merged_sce_file)
   script:
     input_library_ids = library_ids.join(',')
     input_sces = scpca_nf_file.join(',')
-    merged_sce_file = "${project_id}_merged.rds"
+    merged_sce_file = "${merge_group_id}_merged.rds"
     """
     merge_sces.R \
       --input_library_ids "${input_library_ids}" \
       --input_sce_files "${input_sces}" \
       --output_sce_file "${merged_sce_file}" \
       --n_hvg ${params.num_hvg} \
+      ${has_adt ? "--include_alt_exp" : ''} \
       --threads ${task.cpus}
     """
   stub:
@@ -62,23 +63,23 @@ process merge_report {
-  publishDir "${params.results_dir}/merged/${merge_group}"
+  publishDir "${params.results_dir}/merged/${merge_group_id}"
   label 'mem_16'
   input:
-    tuple val(merge_group), path(merged_sce_file)
+    tuple val(merge_group_id), path(merged_sce_file)
     path(report_template)
   output:
     path(merge_report)
   script:
-    merge_report = "${merge_group}_summary_report.html"
+    merge_report = "${merge_group_id}_summary_report.html"
     """
     Rscript -e "rmarkdown::render( \
       '${report_template}', \
       output_file = '${merge_report}', \
-      params = list(merge_group = '${merge_group}', \
+      params = list(merge_group = '${merge_group_id}', \
                     merged_sce = '${merged_sce_file}', \
                     batch_column = 'library_id') \
     )"
     """
   stub:
-    merge_report = "${merge_group}_summary_report.html"
+    merge_report = "${merge_group_id}_summary_report.html"
     """
     touch ${merge_report}
     """
@@ -89,12 +90,22 @@ workflow {
   // grab project ids to run
   project_ids = params.project?.tokenize(',') ?: []
 
-  // read in run metafile, filter to projects of interest, and group by project
-  grouped_libraries_ch = Channel.fromPath(params.run_metafile)
+  // read in run metafile and filter to projects of interest
+  libraries_ch = Channel.fromPath(params.run_metafile)
     .splitCsv(header: true, sep: '\t')
     // filter to only include specified project ids
     .filter{it.scpca_project_id in project_ids}
-    // only include single-cell/single-nuclei which already contain processed altexps, and ensure we don't try to merge libraries from spatial or bulk data
+
+  // get all projects that contain at least one library with CITEseq
+  // emitted as a single list (empty when there are none) so it can be combined with each project below
+  adt_projects = libraries_ch
+    .filter{it.technology.startsWith('CITEseq')}
+    .map{it.scpca_project_id}
+    .unique()
+    .toList()
+
+  grouped_libraries_ch = libraries_ch
+    // only include single-cell/single-nuclei which ensures we don't try to merge libraries from spatial or bulk data
     .filter{it.seq_unit in ['cell', 'nucleus']}
     // create tuple of [project id, library_id, processed_sce_file]
     .map{[
@@ -104,11 +115,19 @@ workflow {
     ]}
     // only include libraries that have been processed through scpca-nf
     .filter{file(it[2]).exists()}
-    // make sure we don't have any duplicates of the same library ID hanging around
-    // this shouldn't be the case since we removed CITE-seq and cell-hashing
+    // only one row per library ID, this removes all the duplicates that may be present due to CITE/hashing
     .unique()
     // group tuple by project id: [project_id, [library_id1, library_id2, ...], [sce_file1, sce_file2, ...]]
     .groupTuple(by: 0)
+    // add in boolean for if project contains samples with adt
+    // adt_projects is a channel, so pair its list with every project before testing membership
+    .combine(adt_projects.map{[it]})
+    .map{project_id, library_id_list, sce_file_list, adt_project_list -> tuple(
+      project_id,
+      project_id in adt_project_list, // determines if altExp should be included in the merged object
+      library_id_list,
+      sce_file_list
+    )}
 
   merge_sce(grouped_libraries_ch)
 