Skip to content

Commit

Permalink
feat: update fusions process
Browse files Browse the repository at this point in the history
  • Loading branch information
shihabdider committed Oct 18, 2024
1 parent d4422a1 commit 835e916
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 30 deletions.
62 changes: 47 additions & 15 deletions bin/Fusions.R
Original file line number Diff line number Diff line change
@@ -1,33 +1,39 @@
library(optparse)
options(bitmapType='cairo')

message("R Libraries:")
print(.libPaths())

options(error = function() {traceback(2); quit("no", 1)})
if (!exists('opt'))
{
option_list = list(
make_option(c("-l", "--libdir"), type = "character", help = "libdir"),
make_option(c("-i", "--id"), type = "character", help = "sample id"),
make_option(c("-g", "--gGraph"), type = "character", help = "an RDS file contains a gGraph or JaBbA graph with cn annotation on nodes and edges"),
make_option(c("-j", "--junctions"), type = "character", help = "an RDS file contains a gGraph or JaBbA graph with cn annotation on nodes and edges, or is a BND formatted SV VCF"),
make_option(c("-r", "--gencode"), type = "character", default = "~/DB/GENCODE/gencode.v19.annotation.gtf.nochr.rds", help = "an RDS or GTF file of GENCODE"),
make_option(c("--genes"), type = "character", default = "/dev/null", help = "Gene list to filter junction breakends"),
make_option(c("--do_only_canonical"), type = "logical", default = TRUE, help = "Use only canonical (longest per gene), protein-coding transcripts"),
make_option(c("-o", "--outdir"), type = "character", default = './', help = "Directory to dump output into"),
make_option(c("--cores"), type = "integer", default = 1L, help = "Number of cores")
)
parseobj = OptionParser(option_list=option_list)
# print(parseobj)
opt = parse_args(parseobj)
print(opt)
saveRDS(opt, paste(opt$outdir, 'cmd.args.rds', sep = '/'))
}

suppressPackageStartupMessages({
library(gGnome)
library(gUtils)
library(parallel) ## needed for mc.cores
library(parallel) ## needed for mc.cores
library(rtracklayer)
library(GenomeInfoDb)
})


## setDTthreads(10)
message("Loading GENCODE: ", opt$gencode)
if (grepl('.rds$', opt$gencode))
{
gencode = readRDS(as.character(opt$gencode))
Expand All @@ -48,30 +54,56 @@ if (opt$do_only_canonical) {
)[, .(transcript_id = transcript_id[which.max(end-start+1)]), by = gene_name]
)
gencode = gencode[
gencode$transcript_id %in% canonical_transcripts$transcript_id
gencode$transcript_id %in% canonical_transcripts$transcript_id
| (gencode$type == "gene" & gencode$gene_name %in% canonical_transcripts$gene_name)
]
}


if (grepl(".rds$", opt$gGraph)) {
gg_input = readRDS(opt$gGraph)
message("Loading SV input: ", opt$junctions)
if (grepl(".rds$", opt$junctions)) {
gg_input = readRDS(opt$junctions)
if (!identical(c("gGraph", "R6"), class(gg_input)) && is.list(gg_input) && any(c("segstats", "purity", "ploidy") %in% names(gg_input)))
gg_input = gGnome::gG(jabba = gg_input) # is jabba list object, convert
} else if (grepl(".vcf(.gz|.bz|.bz2|.xz)?$", opt$gGraph)) {
gg_input = gGnome::gG(junctions = opt$gGraph)
message("Refreshing gGraph input")
gg_input = gGnome::refresh(gg_input)
} else if (grepl(".vcf(.gz|.bz|.bz2|.xz)?$", opt$junctions)) {
gg_input = gGnome::gG(junctions = opt$junctions)
} else {
stop("Unknown input file type: ", opt$gGraph)
stop("Unknown input file type: ", opt$junctions)
}
fus = fusions(gg_input, gencode, verbose = TRUE, mc.cores = opt$cores)

## update events with sample id
if (length(fus))
{
if (!identical(opt$genes, "/dev/null") && file.exists(opt$genes)) {
message("Loading gene list to filter input SVs:")
cat(normalizePath(opt$genes))
gene_list = readLines(opt$genes)
gencode_by_genes = split(gencode, gencode$gene_name)
genes_not_present = setdiff(gene_list, names(gencode_by_genes))
if (length(genes_not_present)) {
message("The following genes are not present in gencode definition")
dput(genes_not_present)
}
genes_present = intersect(gene_list, names(gencode_by_genes))
message("Number of unique genes to query in gencode: ", length(genes_present))
message("\n")
message("Creating new gGraph from junctions overlapping with gene set")
alt_juncs = gg_input$edges$junctions[type == "ALT"]
which_juncs_in_genes = which(alt_juncs %^% unlist(gencode_by_genes[genes_present]))
message(length(which_juncs_in_genes), " ALT junctions in gene set")
gg_input = gGnome::gG(junctions=alt_juncs[which_juncs_in_genes])
rm(gencode_by_genes)
}

fus_lst = fusions(gg_input, gencode, verbose = TRUE, mc.cores = opt$cores, return.all.alt.edges = TRUE)
fus = fus_lst$fus
allaltedges.ann = fus_lst$allaltedges.ann

if (length(fus)) {
fus$set(id = opt$id)
fus$set(mincn = fus$eval(edge = min(cn, na.rm = TRUE)))
is_cn_attribute_present = NROW(fus$edges$dt$cn)
if (is_cn_attribute_present) fus$set(mincn = fus$eval(edge = min(cn, na.rm = TRUE)))
}

saveRDS(fus, paste0(opt$outdir, '/', 'fusions.rds'))
data.table::fwrite(allaltedges.ann, paste0(opt$outdir, '/', 'altedge.annotations.tsv'), sep = "\t")

quit("no", 0)
2 changes: 1 addition & 1 deletion conf/modules/fusions.config
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ process {
publishDir = [
mode: params.publish_dir_mode,
path: { "${params.outdir}/fusions/${meta.id}/" },
pattern: "*{.rds*,.command.*}"
pattern: "*{.rds*,.command.*, .tsv}"
]
}
}
6 changes: 4 additions & 2 deletions modules/local/fusions/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@ process FUSIONS {
'mskilab/events:latest' }"

input:
tuple val(meta), path(gGraph)
tuple val(meta), path(gGraph), path(sv_vcf), path(sv_vcf_tbi)
path(gencode)

output:
tuple val(meta), path("*fusions.rds") , emit: fusions_output
tuple val(meta), path("*altedge.annotations.tsv") , emit: altedge_annotations
path "versions.yml" , emit: versions

when:
Expand All @@ -23,6 +24,7 @@ process FUSIONS {
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def id = "${meta.sample}"
def junctions_flag = sv_vcf ? "--junctions ${sv_vcf}" : "--junctions ${gGraph}"
def VERSION = '0.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.

"""
Expand All @@ -31,7 +33,7 @@ process FUSIONS {
Rscript \$RSCRIPT_PATH \\
--id $id \\
--gGraph $gGraph \\
${junctions_flag} \\
--gencode $gencode \\
--cores ${task.cpus}
Expand Down
6 changes: 3 additions & 3 deletions subworkflows/local/fusions/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,19 @@ gencode = WorkflowNfcasereports.create_file_channel(params.g

workflow GGRAPH_FUSIONS {
take:
input // required: format [val(meta), path(gGraph)]
input // required: format [val(meta), path(gGraph)] or [val(meta), []. path(sv_vcf), path(sv_vcf_tbi)]

main:
versions = Channel.empty()
fusions_output = Channel.empty()

FUSIONS(input, gencode)

fusions_output = FUSIONS.out.fusions_output
altedge_annotations = FUSIONS.out.altedge_annotations
versions = FUSIONS.out.versions

emit:
fusions_output
altedge_annotations

versions
}
2 changes: 1 addition & 1 deletion tests/test_runs/chr21_test/params.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"fasta": "/gpfs/commons/home/sdider/DB/GATK/human_g1k_v37_decoy.fasta",
"fasta_fai": "/gpfs/commons/home/sdider/DB/GATK/human_g1k_v37_decoy.fasta.fai",
"bwa": "/gpfs/commons/home/sdider/DB/GATK/bwa/",
"tools": "bamqc",
"tools": "fusions",
"outdir": "./results",
"pon_dryclean": "/gpfs/commons/home/sdider/Projects/nf-casereports/tests/test_data/chr21_pon.rds",
"field_dryclean": "reads",
Expand Down
2 changes: 1 addition & 1 deletion tests/test_runs/full_test/params.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"bwa": "/gpfs/commons/home/sdider/DB/GATK/bwa/",
"outdir": "./results",
"pon_dryclean": "/gpfs/commons/home/tdey/data/dryclean/MONSTER_PON_RAW/MONSTER_PON_RAW_SORTED/fixed.detergent.rds",
"tools": "all",
"tools": "fusions",
"field_dryclean": "reads",
"genome": "GATK.GRCh37",
"email": "[email protected]"
Expand Down
25 changes: 18 additions & 7 deletions workflows/nfcasereports.nf
Original file line number Diff line number Diff line change
Expand Up @@ -1621,20 +1621,31 @@ workflow NFCASEREPORTS {
// ##############################
if (tools_used.contains("all") || tools_used.contains("fusions")) {
fusions_inputs = inputs.filter { it.fusions.isEmpty() }.map { it -> [it.meta.patient, it.meta] }
fusions_input_non_integer_balance = non_integer_balance_balanced_gg_for_merge
.join(fusions_inputs)
.map { it -> [ it[0], it[1] ] } // meta.patient, balanced_gg

fusions_existing_outputs = inputs.map { it -> [it.meta, it.fusions] }.filter { !it[1].isEmpty() }
if (tools_used.contains("non_integer_balance")) {
fusions_input_non_integer_balance = non_integer_balance_balanced_gg_for_merge
.join(fusions_inputs)
.map { it -> [ it[0], it[1] ] } // meta.patient, balanced_gg
fusions_input = fusions_inputs
.join(fusions_input_non_integer_balance)
.map{ patient, meta, balanced_gg -> [ meta, balanced_gg, [], [] ] }
} else {
fusions_input_sv = vcf_from_sv_calling_for_merge
.join(fusions_inputs)
.map { it -> [ it[0], it[1], it[2] ] } // meta.patient, vcf, vcf_tbi
fusions_input = fusions_inputs
.join(fusions_input_sv)
.map{ patient, meta, sv, sv_tbi -> [ meta, [], sv, sv_tbi ] }
}

fusions_input = fusions_inputs
.join(fusions_input_non_integer_balance)
.map{ patient, meta, balanced_gg -> [ meta, balanced_gg ] }
fusions_existing_outputs = inputs.map { it -> [it.meta, it.fusions] }.filter { !it[1].isEmpty() }

FUSIONS(fusions_input)
fusions = Channel.empty()
.mix(FUSIONS.out.fusions_output)
.mix(fusions_existing_outputs)
altedge_annotations = Channel.empty()
.mix(FUSIONS.out.altedge_annotations)
versions = Channel.empty().mix(FUSIONS.out.versions)
}

Expand Down

0 comments on commit 835e916

Please sign in to comment.