From 835e91688b5d0bd498f7d55959b68d19beca56d8 Mon Sep 17 00:00:00 2001 From: Shihab Dider Date: Fri, 18 Oct 2024 18:34:24 -0400 Subject: [PATCH] feat: update fusions process --- bin/Fusions.R | 62 +++++++++++++++++++------- conf/modules/fusions.config | 2 +- modules/local/fusions/main.nf | 6 ++- subworkflows/local/fusions/main.nf | 6 +-- tests/test_runs/chr21_test/params.json | 2 +- tests/test_runs/full_test/params.json | 2 +- workflows/nfcasereports.nf | 25 ++++++++--- 7 files changed, 75 insertions(+), 30 deletions(-) diff --git a/bin/Fusions.R b/bin/Fusions.R index 04c1b6d..f4edf3f 100644 --- a/bin/Fusions.R +++ b/bin/Fusions.R @@ -1,33 +1,39 @@ library(optparse) options(bitmapType='cairo') +message("R Libraries:") +print(.libPaths()) + options(error = function() {traceback(2); quit("no", 1)}) if (!exists('opt')) { option_list = list( make_option(c("-l", "--libdir"), type = "character", help = "libdir"), make_option(c("-i", "--id"), type = "character", help = "sample id"), - make_option(c("-g", "--gGraph"), type = "character", help = "an RDS file contains a gGraph or JaBbA graph with cn annotation on nodes and edges"), + make_option(c("-j", "--junctions"), type = "character", help = "an RDS file contains a gGraph or JaBbA graph with cn annotation on nodes and edges, or is a BND formatted SV VCF"), make_option(c("-r", "--gencode"), type = "character", default = "~/DB/GENCODE/gencode.v19.annotation.gtf.nochr.rds", help = "an RDS or GTF file of GENCODE"), + make_option(c("--genes"), type = "character", default = "/dev/null", help = "Gene list to filter junction breakends"), make_option(c("--do_only_canonical"), type = "logical", default = TRUE, help = "Use only canonical (longest per gene), protein-coding transcripts"), make_option(c("-o", "--outdir"), type = "character", default = './', help = "Directory to dump output into"), make_option(c("--cores"), type = "integer", default = 1L, help = "Number of cores") ) parseobj = OptionParser(option_list=option_list) + # print(parseobj) opt = parse_args(parseobj) + print(opt) saveRDS(opt, paste(opt$outdir, 'cmd.args.rds', sep = '/')) } suppressPackageStartupMessages({ library(gGnome) library(gUtils) - library(parallel) ## needed for mc.cores + library(parallel) ## needed for mc.cores library(rtracklayer) library(GenomeInfoDb) }) -## setDTthreads(10) +message("Loading GENCODE: ", opt$gencode) if (grepl('.rds$', opt$gencode)) { gencode = readRDS(as.character(opt$gencode)) @@ -48,30 +54,56 @@ if (opt$do_only_canonical) { )[, .(transcript_id = transcript_id[which.max(end-start+1)]), by = gene_name] ) gencode = gencode[ - gencode$transcript_id %in% canonical_transcripts$transcript_id + gencode$transcript_id %in% canonical_transcripts$transcript_id | (gencode$type == "gene" & gencode$gene_name %in% canonical_transcripts$gene_name) ] } - -if (grepl(".rds$", opt$gGraph)) { - gg_input = readRDS(opt$gGraph) +message("Loading SV input: ", opt$junctions) +if (grepl(".rds$", opt$junctions)) { + gg_input = readRDS(opt$junctions) if (!identical(c("gGraph", "R6"), class(gg_input)) && is.list(gg_input) && any(c("segstats", "purity", "ploidy") %in% names(gg_input))) gg_input = gGnome::gG(jabba = gg_input) # is jabba list object, convert -} else if (grepl(".vcf(.gz|.bz|.bz2|.xz)?$", opt$gGraph)) { - gg_input = gGnome::gG(junctions = opt$gGraph) + message("Refreshing gGraph input") + gg_input = gGnome::refresh(gg_input) +} else if (grepl(".vcf(.gz|.bz|.bz2|.xz)?$", opt$junctions)) { + gg_input = gGnome::gG(junctions = opt$junctions) } else { - stop("Unknown input file type: ", opt$gGraph) + stop("Unknown input file type: ", opt$junctions) } -fus = fusions(gg_input, gencode, verbose = TRUE, mc.cores = opt$cores) -## update events with sample id -if (length(fus)) -{ +if (!identical(opt$genes, "/dev/null") && file.exists(opt$genes)) { + message("Loading gene list to filter input SVs:") + cat(normalizePath(opt$genes)) + gene_list = readLines(opt$genes) + gencode_by_genes = split(gencode, gencode$gene_name) + genes_not_present = setdiff(gene_list, names(gencode_by_genes)) + if (length(genes_not_present)) { + message("The following genes are not present in gencode definition") + dput(genes_not_present) + } + genes_present = intersect(gene_list, names(gencode_by_genes)) + message("Number of unique genes to query in gencode: ", length(genes_present)) + message("\n") + message("Creating new gGraph from junctions overlapping with gene set") + alt_juncs = gg_input$edges$junctions[type == "ALT"] + which_juncs_in_genes = which(alt_juncs %^% unlist(gencode_by_genes[genes_present])) + message(length(which_juncs_in_genes), " ALT junctions in gene set") + gg_input = gGnome::gG(junctions=alt_juncs[which_juncs_in_genes]) + rm(gencode_by_genes) +} + +fus_lst = fusions(gg_input, gencode, verbose = TRUE, mc.cores = opt$cores, return.all.alt.edges = TRUE) +fus = fus_lst$fus +allaltedges.ann = fus_lst$allaltedges.ann + +if (length(fus)) { fus$set(id = opt$id) - fus$set(mincn = fus$eval(edge = min(cn, na.rm = TRUE))) + is_cn_attribute_present = NROW(fus$edges$dt$cn) + if (is_cn_attribute_present) fus$set(mincn = fus$eval(edge = min(cn, na.rm = TRUE))) } saveRDS(fus, paste0(opt$outdir, '/', 'fusions.rds')) +data.table::fwrite(allaltedges.ann, paste0(opt$outdir, '/', 'altedge.annotations.tsv'), sep = "\t") quit("no", 0) diff --git a/conf/modules/fusions.config b/conf/modules/fusions.config index e295ece..cbf9c3e 100644 --- a/conf/modules/fusions.config +++ b/conf/modules/fusions.config @@ -19,7 +19,7 @@ process { publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/fusions/${meta.id}/" }, - pattern: "*{.rds*,.command.*}" + pattern: "*{.rds*,.command.*, .tsv}" ] } } diff --git a/modules/local/fusions/main.nf b/modules/local/fusions/main.nf index 9879e86..1efc890 100644 --- a/modules/local/fusions/main.nf +++ b/modules/local/fusions/main.nf @@ -9,11 +9,12 @@ process FUSIONS { 'mskilab/events:latest' }" input: - tuple val(meta), path(gGraph) + tuple val(meta), path(gGraph), path(sv_vcf), path(sv_vcf_tbi) path(gencode) output: tuple val(meta), path("*fusions.rds") , emit: fusions_output + tuple val(meta), path("*altedge.annotations.tsv") , emit: altedge_annotations path "versions.yml" , emit: versions when: @@ -23,6 +24,7 @@ process FUSIONS { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def id = "${meta.sample}" + def junctions_flag = sv_vcf ? "--junctions ${sv_vcf}" : "--junctions ${gGraph}" def VERSION = '0.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. """ @@ -31,7 +33,7 @@ process FUSIONS { Rscript \$RSCRIPT_PATH \\ --id $id \\ - --gGraph $gGraph \\ + ${junctions_flag} \\ --gencode $gencode \\ --cores ${task.cpus} diff --git a/subworkflows/local/fusions/main.nf b/subworkflows/local/fusions/main.nf index 3aa055d..30d3914 100644 --- a/subworkflows/local/fusions/main.nf +++ b/subworkflows/local/fusions/main.nf @@ -4,19 +4,19 @@ gencode = WorkflowNfcasereports.create_file_channel(params.g workflow GGRAPH_FUSIONS { take: - input // required: format [val(meta), path(gGraph)] + input // required: format [val(meta), path(gGraph)] or [val(meta), []. path(sv_vcf), path(sv_vcf_tbi)] main: - versions = Channel.empty() - fusions_output = Channel.empty() FUSIONS(input, gencode) fusions_output = FUSIONS.out.fusions_output + altedge_annotations = FUSIONS.out.altedge_annotations versions = FUSIONS.out.versions emit: fusions_output + altedge_annotations versions } diff --git a/tests/test_runs/chr21_test/params.json b/tests/test_runs/chr21_test/params.json index 03dd3d5..9d8ac17 100644 --- a/tests/test_runs/chr21_test/params.json +++ b/tests/test_runs/chr21_test/params.json @@ -4,7 +4,7 @@ "fasta": "/gpfs/commons/home/sdider/DB/GATK/human_g1k_v37_decoy.fasta", "fasta_fai": "/gpfs/commons/home/sdider/DB/GATK/human_g1k_v37_decoy.fasta.fai", "bwa": "/gpfs/commons/home/sdider/DB/GATK/bwa/", - "tools": "bamqc", + "tools": "fusions", "outdir": "./results", "pon_dryclean": "/gpfs/commons/home/sdider/Projects/nf-casereports/tests/test_data/chr21_pon.rds", "field_dryclean": "reads", diff --git a/tests/test_runs/full_test/params.json b/tests/test_runs/full_test/params.json index 975e636..e12c32a 100644 --- a/tests/test_runs/full_test/params.json +++ b/tests/test_runs/full_test/params.json @@ -6,7 +6,7 @@ "bwa": "/gpfs/commons/home/sdider/DB/GATK/bwa/", "outdir": "./results", "pon_dryclean": "/gpfs/commons/home/tdey/data/dryclean/MONSTER_PON_RAW/MONSTER_PON_RAW_SORTED/fixed.detergent.rds", - "tools": "all", + "tools": "fusions", "field_dryclean": "reads", "genome": "GATK.GRCh37", "email": "shihabdider@gmail.com" diff --git a/workflows/nfcasereports.nf b/workflows/nfcasereports.nf index 8aae161..ad70b2c 100644 --- a/workflows/nfcasereports.nf +++ b/workflows/nfcasereports.nf @@ -1621,20 +1621,31 @@ workflow NFCASEREPORTS { // ############################## if (tools_used.contains("all") || tools_used.contains("fusions")) { fusions_inputs = inputs.filter { it.fusions.isEmpty() }.map { it -> [it.meta.patient, it.meta] } - fusions_input_non_integer_balance = non_integer_balance_balanced_gg_for_merge - .join(fusions_inputs) - .map { it -> [ it[0], it[1] ] } // meta.patient, balanced_gg - fusions_existing_outputs = inputs.map { it -> [it.meta, it.fusions] }.filter { !it[1].isEmpty() } + if (tools_used.contains("non_integer_balance")) { + fusions_input_non_integer_balance = non_integer_balance_balanced_gg_for_merge + .join(fusions_inputs) + .map { it -> [ it[0], it[1] ] } // meta.patient, balanced_gg + fusions_input = fusions_inputs + .join(fusions_input_non_integer_balance) + .map{ patient, meta, balanced_gg -> [ meta, balanced_gg, [], [] ] } + } else { + fusions_input_sv = vcf_from_sv_calling_for_merge + .join(fusions_inputs) + .map { it -> [ it[0], it[1], it[2] ] } // meta.patient, vcf, vcf_tbi + fusions_input = fusions_inputs + .join(fusions_input_sv) + .map{ patient, meta, sv, sv_tbi -> [ meta, [], sv, sv_tbi ] } + } - fusions_input = fusions_inputs - .join(fusions_input_non_integer_balance) - .map{ patient, meta, balanced_gg -> [ meta, balanced_gg ] } + fusions_existing_outputs = inputs.map { it -> [it.meta, it.fusions] }.filter { !it[1].isEmpty() } FUSIONS(fusions_input) fusions = Channel.empty() .mix(FUSIONS.out.fusions_output) .mix(fusions_existing_outputs) + altedge_annotations = Channel.empty() + .mix(FUSIONS.out.altedge_annotations) versions = Channel.empty().mix(FUSIONS.out.versions) }