Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Account for feature data when merging objects #610

Merged
merged 5 commits into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion bin/merge_sces.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ option_list <- list(
help = "number of high variance genes to use for dimension reduction;
the default is n_hvg = 2000"
),
make_option(
opt_str = c("--include_alt_exp"),
action = "store_true",
default = FALSE,
help = "Keep any altExp present in the merged object."
),
make_option(
opt_str = c("-t", "--threads"),
type = "integer",
Expand Down Expand Up @@ -108,7 +114,8 @@ merged_sce <- scpcaTools::merge_sce_list(
sce_list,
batch_column = "library_id",
preserve_rowdata_cols = "gene_symbol",
cell_id_column = "cell_id"
cell_id_column = "cell_id",
include_alt_exp = include_alt_exp
)


Expand Down
41 changes: 28 additions & 13 deletions merge.nf
Original file line number Diff line number Diff line change
Expand Up @@ -31,21 +31,22 @@ if(param_error){
process merge_sce {
container params.SCPCATOOLS_CONTAINER
label 'mem_16'
publishDir "${params.checkpoints_dir}/merged"
publishDir "${params.results_dir}/merged/${project_id}"
input:
tuple val(project_id), val(library_ids), path(scpca_nf_file)
tuple val(merge_group_id), val(has_adt), val(library_ids), path(scpca_nf_file)
output:
tuple val(project_id), path(merged_sce_file)
tuple val(merge_group_id), val(has_adt), path(merged_sce_file)
script:
input_library_ids = library_ids.join(',')
input_sces = scpca_nf_file.join(',')
merged_sce_file = "${project_id}_merged.rds"
merged_sce_file = "${merge_group_id}_merged.rds"
"""
merge_sces.R \
--input_library_ids "${input_library_ids}" \
--input_sce_files "${input_sces}" \
--output_sce_file "${merged_sce_file}" \
--n_hvg ${params.num_hvg} \
${has_adt ? "--include_alt_exp" : ''} \
--threads ${task.cpus}
"""
stub:
Expand All @@ -62,23 +63,23 @@ process merge_report {
publishDir "${params.results_dir}/merged/${merge_group_id}"
label 'mem_16'
input:
tuple val(merge_group), path(merged_sce_file)
tuple val(merge_group_id), path(merged_sce_file)
path(report_template)
output:
path(merge_report)
script:
merge_report = "${merge_group}_summary_report.html"
merge_report = "${merge_group_id}_summary_report.html"
"""
Rscript -e "rmarkdown::render( \
'${report_template}', \
output_file = '${merge_report}', \
params = list(merge_group = '${merge_group}', \
params = list(merge_group = '${merge_group_id}', \
merged_sce = '${merged_sce_file}', \
batch_column = 'library_id') \
)"
"""
stub:
merge_report = "${merge_group}_summary_report.html"
merge_report = "${merge_group_id}_summary_report.html"
"""
touch ${merge_report}
"""
Expand All @@ -89,12 +90,20 @@ workflow {
// grab project ids to run
project_ids = params.project?.tokenize(',') ?: []

// read in run metafile, filter to projects of interest, and group by project
grouped_libraries_ch = Channel.fromPath(params.run_metafile)
// read in run metafile and filter to projects of interest
libraries_ch = Channel.fromPath(params.run_metafile)
.splitCsv(header: true, sep: '\t')
// filter to only include specified project ids
.filter{it.scpca_project_id in project_ids}
// only include single-cell/single-nuclei which already contain processed altexps, and ensure we don't try to merge libraries from spatial or bulk data

// get all projects that contain at least one library with CITEseq
adt_projects = libraries_ch
.filter{it.technology.startsWith('CITEseq')}
.collect{it.scpca_project_id}
.unique()

grouped_libraries_ch = libraries_ch
// only include single-cell/single-nuclei libraries, which ensures we don't try to merge libraries from spatial or bulk data
.filter{it.seq_unit in ['cell', 'nucleus']}
// create tuple of [project id, library_id, processed_sce_file]
.map{[
Expand All @@ -104,11 +113,17 @@ workflow {
]}
// only include libraries that have been processed through scpca-nf
.filter{file(it[2]).exists()}
// make sure we don't have any duplicates of the same library ID hanging around
// this shouldn't be the case since we removed CITE-seq and cell-hashing
// keep only one row per library ID; this removes any duplicates that may be present due to CITE-seq/cell hashing
.unique()
// group tuple by project id: [project_id, [library_id1, library_id2, ...], [sce_file1, sce_file2, ...]]
.groupTuple(by: 0)
// add a boolean indicating whether the project contains samples with ADT
.map{project_id, library_id_list, sce_file_list -> tuple(
project_id,
project_id in adt_projects, // determines if altExp should be included in the merged object
library_id_list,
sce_file_list
)}

merge_sce(grouped_libraries_ch)

Expand Down