From 2b0deb69c33cbb5bfbb52df9a6220226310ec382 Mon Sep 17 00:00:00 2001 From: shihabdider Date: Tue, 7 May 2024 14:45:42 -0400 Subject: [PATCH 1/3] refactor: fix inputs of post jabba processes --- bin/Events.R | 2 +- bin/Fusions.R | 50 + bin/cbsFH.R | 2 +- bin/lp_phased_balance.R | 2371 +++++++++++++++++++- bin/non_integer_balance.R | 40 +- conf/igenomes.config | 6 +- conf/modules/alleic_cn.config | 36 - conf/modules/allelic_cn.config | 70 + main.nf | 4 + modules/local/allelic_cn/main.nf | 24 +- modules/local/ascat/main.nf | 2 +- modules/local/cbs/main.nf | 1 - modules/local/dryclean/main.nf | 14 +- modules/local/fusions/main.nf | 3 +- modules/local/gridss/gridss/main.nf | 6 +- nextflow.config | 10 +- nextflow_schema.json | 8 - subworkflows/local/allelic_cn/main.nf | 4 + subworkflows/local/bam_fragCounter/main.nf | 7 +- subworkflows/local/cov_dryclean/main.nf | 21 +- tests/modules/local/dryclean/main.nf.test | 5 +- tests/nextflow.config | 15 +- workflows/nfjabba.nf | 253 +-- 23 files changed, 2678 insertions(+), 276 deletions(-) create mode 100644 conf/modules/allelic_cn.config diff --git a/bin/Events.R b/bin/Events.R index 6b56dee..6bb242f 100755 --- a/bin/Events.R +++ b/bin/Events.R @@ -39,7 +39,7 @@ withAutoprint( min.span = 1e6, max.small = 1e4) newtic = tic - overwritefun("newtic", "tic", "gGnome") + # overwritefun("newtic", "tic", "gGnome") ## call complex events diff --git a/bin/Fusions.R b/bin/Fusions.R index e69de29..372ee4c 100644 --- a/bin/Fusions.R +++ b/bin/Fusions.R @@ -0,0 +1,50 @@ +withAutoprint( + { + library(optparse) + options(bitmapType = "cairo") + + options(error = function() { + traceback(2) + quit("no", 1) + }) + if (!exists("opt")) { + option_list <- list( + make_option(c("-i", "--id"), type = "character", help = "sample id"), + make_option(c("-g", "--gGraph"), type = "character", help = "an RDS file contains a gGraph or JaBbA graph with cn annotation on nodes and edges"), + make_option(c("-r", "--gencode"), type = "character", help = "an RDS or GTF file of GENCODE"), + make_option(c("-o", "--outdir"), type = "character", default = "./", help = "Directory to dump output into"), + make_option(c("--cores"), type = "integer", default = 1L, help = "Number of cores") + ) + parseobj <- OptionParser(option_list = option_list) + opt <- parse_args(parseobj) + saveRDS(opt, paste(opt$outdir, "cmd.args.rds", sep = "/")) + } + + library(gGnome) + library(gUtils) + library(parallel) ## needed for mc.cores + + ## setDTthreads(10) + if (grepl(".rds$", opt$gencode)) { + gencode <- readRDS(as.character(opt$gencode)) + } else { + gencode <- rtracklayer::import(opt$gencode) + } + + + ## call complex events + ## fus = fusions(gG(jab = opt$gGraph), gencode, verbose = TRUE, opt$cores) + fus <- fusions(gG(jab = opt$gGraph), gencode, verbose = TRUE, mc.cores = opt$cores) + + ## update events with sample id + if (length(fus)) { + fus$set(id = opt$id) + fus$set(mincn = fus$eval(edge = min(cn, na.rm = TRUE))) + } + + saveRDS(fus, paste0(opt$outdir, "/", "fusions.rds")) + + quit("no", 0) + }, + echo = FALSE +) diff --git a/bin/cbsFH.R b/bin/cbsFH.R index e7b3ced..8a2ab2f 100644 --- a/bin/cbsFH.R +++ b/bin/cbsFH.R @@ -8506,7 +8506,7 @@ ppgrid = function( else ## only row, only go left right M = (NLLc < NLLcl & NLLc < NLLcr)[-c(1, nrow(NLLc)), -c(1, ncol(NLLc)), drop = FALSE] - if (length(M)>1) + if (length(M)>1 & any(M, na.rm=TRUE)) { ix = which(M, arr.ind= T); if (nrow(ix)>1) diff --git a/bin/lp_phased_balance.R b/bin/lp_phased_balance.R index b920905..b3323e4 100644 --- a/bin/lp_phased_balance.R +++ b/bin/lp_phased_balance.R @@ -35,18 +35,588 @@ ## updated balance? yikes haha library(JaBbA) - devtools::load_all("~/git/gGnome_ZC") - devtools::load_all("~/git/zitools") + library(zitools) + library(gGnome) library(gUtils) library(skitools) library(DNAcopy) - ## the following are neede for grab.hets and grab.hets.from.maf but not doing that for now.. - ## source("~/projects/phasing/phasing_utils.R") - ## source("~/projects/phasing/tmp.R") + # utils.R has been pasted below + # source("~/utils.R") - source("~/utils.R") + ## start of utils.R + + #' @name diploid2haploid + #' @title diploid2haploid + #' + #' @param gg (gGraph) diploid gGraph (expect cn.low and cn.high as node metadata) + #' @param verbose (logical) default FALSE + #' + #' @return gGraph with field allele and cn + diploid2haploid = function(gg, verbose = FALSE) { + og.nodes.gr = gg$nodes$gr[, c("cn.low", "cn.high", "var.low", "var.high", "cn", "node.id", "nhets")] + values(og.nodes.gr)[, "og.node.id"] = values(gg$nodes$gr)[, "node.id"] + names(values(og.nodes.gr))[names(values(og.nodes.gr)) == "cn"] = "cn.total" + + ## prepare nodes for melted graph + phased.gg.nodes = c(og.nodes.gr, og.nodes.gr) + values(phased.gg.nodes)[, "cn"] = c(values(og.nodes.gr)[, "cn.high"], values(og.nodes.gr)[, "cn.low"]) + values(phased.gg.nodes)[, "allele"] = c(rep("major", length(og.nodes.gr)), rep("minor", length(og.nodes.gr))) + values(phased.gg.nodes)[, "variance"] = c(values(og.nodes.gr)[, "var.high"], values(og.nodes.gr)[, "var.low"]) + values(phased.gg.nodes)[, "variance"] = c(values(og.nodes.gr)[, "var.high"], values(og.nodes.gr)[, "var.low"]) + ## bound the variance + values(phased.gg.nodes)[, "var.adj"] = pmax(pmax(values(phased.gg.nodes)[, "variance"], values(phased.gg.nodes)[, "cn"]), 1) + + ## compute the weight + values(phased.gg.nodes)[, "weight"] = values(phased.gg.nodes)[, "nhets"] / values(phased.gg.nodes)[, "var.adj"] + + ## prepare edges for melted graph + phased.gg.edges = rbind( + gg$edges$dt[, .(n1, n2, n1.side, n2.side, type, + og.edge.id = edge.id, + n1.allele = "major", + n2.allele = "major")], + gg$edges$dt[, .(n1 = n1 + length(og.nodes.gr), n2 = n2 + length(og.nodes.gr), type, + n1.side, n2.side, + og.edge.id = edge.id, + n1.allele = "minor", + n2.allele = "minor")], + gg$edges$dt[, .(n1, n2 = n2 + length(og.nodes.gr), type, + n1.side, n2.side, + og.edge.id = edge.id, + n1.allele = "major", + n2.allele = "minor")], + gg$edges$dt[, .(n1 = n1 + length(og.nodes.gr), n2, type, + n1.side, n2.side, + og.edge.id = edge.id, + n1.allele = "minor", + n2.allele = "major")] + ) + + ## add n1/n2 chromosome information + phased.gg.edges[, ":="(n1.chr = seqnames(phased.gg.nodes)[n1] %>% as.character, + n2.chr = seqnames(phased.gg.nodes)[n2] %>% as.character)] + + ## add edge connection type (straight/cross) + phased.gg.edges[n1.chr == n2.chr & n1.allele == n2.allele, connection := "straight"] + phased.gg.edges[n1.chr == n2.chr & n1.allele != n2.allele, connection := "cross"] + + phased.gg = gG(nodes = phased.gg.nodes, edges = phased.gg.edges) + + ref.edge.col = alpha("blue", 0.3) + alt.edge.col = alpha("red", 0.3) + ref.edge.lwd = 0.5 + alt.edge.lwd = 1.0 + phased.gg$edges$mark(col = ifelse(phased.gg$edges$dt$type == "REF", ref.edge.col, alt.edge.col), + lwd = ifelse(phased.gg$edges$dt$type == "REF", ref.edge.lwd, alt.edge.lwd)) + + major.node.col = alpha("red", 0.5) + minor.node.col = alpha("blue", 0.5) + phased.gg$nodes$mark(col = ifelse(phased.gg$nodes$dt$allele == "major", major.node.col, minor.node.col), + ywid = 0.8) + + return(phased.gg) + } + + #################### + #' @name jabba.alleles2 + #' @title jabba.alleles2 + #' @rdname internal + #' jabba.alleles + #' + #' @description + #' Populates allelic value s for JaBbA object. This does not explicitly impose junction balance constraints on alleles, but rather just computes + #' the maximum likelihood estimate given allelic counts and the inferred total copy number on a given segment according to JaBbA + #' + #' @param jab JaBbA object + #' @param het.sites GRanges with meta data fields (see below) for alt and rref count + #' @param alt.count.field character specifying alt.count meta data field in input het.sites (default $alt) + #' @param ref.count.field character specifying ref.count meta data field in input het.sites (default $ref) + #' @param split.ab logical flag whether to split aberrant segmetns (segmentss with ab edge entering or leaving prior to computing allelic states (default FALSE) + #' @param uncoupled logical flag whether to not collapse segments after inferring MLE estimate (default FALSE), if FALSE will try to merge adjacent segments and populate allele-specific junctions with copy numbers on the basis of the MLE fit on individual allelic segments + #' @param conservative if TRUE then will leave certain allelic segments "unphased" if one cannot sync the high / low interval state with the incoming and / or outgoing junction state + #' @param marginal fix marginal? default TRUE + #' @param verbose logical flag + #' @return + #' list with following fields: + #' $segstats = GRanges of input segments with $cn.high and $cn.low segments populated + #' $asegstats = GRanges of allelic segments (length is 2*length(segstats)) with high and low segments each having $cn, this is a "melted" segstats GRAnges + #' $agtrack = gTrack of allelic segments and supporting input het.sites + #' $aadj = allelic adjacency matrix of allele specific junctions + #' $ab.ix = indices of aberrant edges in $aadj + #' $ref.ix = indices of reference edges in $aadj + ############################################ + jabba.alleles2 = function(jab, + het.sites, ## granges with meta data fields for alt.count and + alt.count.field = 'alt', + ref.count.field = 'ref', + baf.field = 'baf.t', + split.ab = F, ## if split.ab == T, then will split across any "aberrant" segment (i.e. segment with ab edge entering or leaving prior to computing allelic states (note: this might create gaps) + uncoupled = FALSE, ## if uncoupled, we just assign each high low allele the MLE conditioning on the total copy number + conservative = FALSE, ## if TRUE then will leave certain allelic segments "unphased" if one cannot sync the high / low interval state with the incoming and / or outgoing junction state + marginal = TRUE, + verbose = F + ) + { + if (!all(c(alt.count.field, ref.count.field) %in% names(values(het.sites)))){ + jwarning('count fields not found in meta data of het.sites input, trying BAF...') + if (!(baf.field %in% names(values(het.sites)))) + jerror('BAF field not found in meta data of het.sites input either!') + else{ + ## outputs are re.seg$low and re.seg$high + ## test deviations of observed BAF from expected by beta distribution + if (verbose) + message('Processing', length(het.sites), + 'het sites using fields', baf.field, '\n') + + } + } else { + ## jerror('count fields not found in meta data of het.sites input') + + if (verbose) + { + message('Processing ', length(het.sites), ' het sites using fields ', alt.count.field, ' and ', ref.count.field) + } + + het.sites$low.count = pmin(values(het.sites)[, alt.count.field], values(het.sites)[, ref.count.field]) + het.sites$high.count = pmax(values(het.sites)[, alt.count.field], values(het.sites)[, ref.count.field]) + + het.sites = het.sites[!is.na(het.sites$low.count) & !is.na(het.sites$high.count)] + + ## stretch out het sites + bin.gaps = gaps(het.sites) + bin.gaps = bin.gaps %Q% (strand(bin.gaps) == "*") + bin.gaps = resize(bin.gaps, width = width(bin.gaps) + 1, fix = "start") + values(bin.gaps)[, "low.count"] = gr.val(query = bin.gaps[, c()], + target = het.sites, + val = "low.count", + mean = TRUE, + na.rm = TRUE)$low.count + values(bin.gaps)[, "high.count"] = gr.val(query = bin.gaps[, c()], + target = het.sites, + val = "high.count", + mean = TRUE, + na.rm = TRUE)$high.count + + het.sites = bin.gaps ## use these stretched out sites + + het.sites = het.sites[!is.na(het.sites$low.count) & !is.na(het.sites$high.count)] + + ss.p = jab$segstats[ as.logical( strand(jab$segstats)=='+' ) ] + + ## find the reference junctions + ord.ix = order(jab$segstats) + rev.ix = as.logical(strand(jab$segstats[ord.ix]) == '-') + ord.ix = c(ord.ix[!rev.ix], rev(ord.ix[rev.ix])) + + ref.jun = cbind(ord.ix[-length(ord.ix)], ord.ix[-1]) + ref.jun = ref.jun[which(jab$adj[ref.jun]>0), ] + + has.ab.rand = 0 + if (split.ab) + { + ab.adj = jab$adj + ab.adj[ref.jun] = 0 + has.ab = as.numeric(Matrix::rowSums(ab.adj!=0)!=0 | Matrix::colSums(ab.adj!=0)!=0)[which( as.logical( strand(jab$segstats)=='+')) ] + has.ab.rand = runif(length(ss.p)) * 1e-6 * has.ab + } + + ss.p = ss.p[!is.na(ss.p$cn)] + ## browser() + #' zchoo Wednesday, Apr 27, 2022 08:34:21 AM + ## don't do this since this merges ranges with the same score! + ## re.seg = as(coverage(ss.p, weight = ss.p$cn + has.ab.rand), 'GRanges') + re.seg = ss.p[, "cn"] + ## re.seg$cn = round(re.seg$score) + + het.sites$ix = gr.match(het.sites, re.seg) + + if (verbose) + { + message('Computed high / low counts and matched to segs') + } + + + highs = split(het.sites$high.count, het.sites$ix)[as.character(seq_along(re.seg))] + lows = split(het.sites$low.count, het.sites$ix)[as.character(seq_along(re.seg))] + + het.sites$cn = re.seg$cn[het.sites$ix] + purity = jab$purity + ploidy = mean(het.sites$cn, na.rm = T) ## ploidy may be slightly different from "global ploidy" depending on the distribution of sites + + sw = length(het.sites) + total = sum(as.numeric(c(het.sites$high.count, het.sites$low.count))) + + cn = re.seg$cn + ## gamma = 2*(1-purity)/purity ## gammas and betas need to be recomputed for + ## beta = (2*(1-purity)*sw + purity*ploidy*sw) / (purity * total) + gamma = 1*(1-purity)/purity ## gammas and betas need to be recomputed for (1 since we are looking at het alleles) + beta = (1*(1-purity)*sw + purity*ploidy*sw) / (purity * total) + centers = (0:(max(cn)+1) + gamma)/beta + + if (verbose) + { + message('Computed SNP ploidy and allelic copy centers') + } + + ## now test deviation from each absolute copy combo using poisson model + ## i.e. counts ~ poisson(expected mean) + ## + re.seg.tmp = lapply(seq_along(re.seg), function(i) + { + ## if (verbose) + ## cat('.') + x = lows[[i]] + if (length(x)==0) + return(list(low = NA, high = NA)) + y = highs[[i]] + tot.cn = cn[i] + ## allow for +/- errors + ## use negative binomial?? + ## ll = sapply(0:(floor(tot.cn/2)), function(j) sum(pnbinom(x, mu = centers[j+1], + ## size = 0,##centers[j+1] / 2, + ## log.p = T) + + ## pnbinom(y, mu = centers[tot.cn - j + 1], + ## size = centers[tot.cn - j + 1], + ## log.p = T))) + ll = sapply(0:(floor(tot.cn/2)), function(j) sum(ppois(x,centers[j+1], log.p = T) + + ppois(y,centers[tot.cn-j+1],log.p = T))) + ll = ll - min(ll) + curr.best = max(ll) + curr.cn = which.max(ll) - 1 + curr.hcn = tot.cn - curr.cn + if (!marginal) { + if (tot.cn > 1) { + ## ll = sapply(0:(floor(tot.cn/2)), function(j) sum(pnbinom(x, mu = centers[j+1], + ## size = centers[j+1] / 2, + ## log.p = T) + + ## pnbinom(y, mu = centers[tot.cn - j], + ## size = centers[tot.cn - j] / 2, + ## log.p = T))) + ll = sapply(0:(floor((tot.cn - 1)/2)), function(j) sum(ppois(x,centers[j+1], log.p = T) + + ppois(y,centers[tot.cn-j],log.p = T))) + ll = ll - min(ll) + if (max(ll) > curr.best) { + curr.best = max(ll) + curr.cn = which.max(ll) - 1 + curr.hcn = tot.cn - curr.cn - 1 + } + } + ## ll = sapply(0:(floor(tot.cn/2)), function(j) sum(pnbinom(x, mu = centers[j+1], + ## size = centers[j+1], + ## log.p = T) + + ## pnbinom(y, mu = centers[tot.cn - j + 2], + ## size = centers[tot.cn - j + 2], + ## log.p = T))) + ll = sapply(0:(floor((tot.cn + 1)/2)), function(j) sum(ppois(x,centers[j+1], log.p = T) + + ppois(y,centers[tot.cn-j+2],log.p = T))) + ll = ll - min(ll) + if (max(ll) > curr.best) { + curr.best = max(ll) + curr.cn = which.max(ll) - 1 + curr.hcn = tot.cn - curr.cn + 1 + } + } + ##return(list(low = curr.cn, high = tot.cn - curr.cn)) + return(list(low = curr.cn, high = curr.hcn)) + }) + + re.seg$low = lapply(re.seg.tmp, function(x) {x$low}) %>% unlist + re.seg$high = lapply(re.seg.tmp, function(x) {x$high}) %>% unlist + } + ## ######################################################################### + ## borderline, below are common to both methods + ## no need to round and NA segments are fine + jab$segstats$cn.low = gr.val(jab$segstats, re.seg, 'low', na.rm = TRUE)$low + jab$segstats$cn.high = gr.val(jab$segstats, re.seg, 'high', na.rm = TRUE)$high + + ## also get number of hets per segment + jab$segstats$nhets = jab$segstats %N% het.sites + + ## and the variance + jab$segstats$var.low = gr.val(query = jab$segstats, + target = het.sites, + val = 'low.count', + na.rm = TRUE, + FUN = function(x, w, na.rm) {var(x, na.rm = na.rm)})$low.count + + jab$segstats$var.high = gr.val(query = jab$segstats, + target = het.sites, + val = 'high.count', + na.rm = TRUE, + FUN = function(x, w, na.rm) {var(x, na.rm = na.rm)})$high.count + + ## NA-out some nodes + na.ix = (!gr.val(jab$segstats, re.seg, 'low', FUN = function(x,w,na.rm) any(!is.na(x)))$low) | + (!gr.val(jab$segstats, re.seg, 'high', FUN = function(x,w,na.rm) any(!is.na(x)))$high) + jab$segstats$cn.low[na.ix] = jab$segstats$cn.high[na.ix] = NA + jab$segstats$var.low[na.ix] = jab$segstats$var.high[na.ix] = NA + jab$segstats$nhets[na.ix] = 0 + + ## ## ########### + ## ## phasing + ## ## ########### + + ## ## iterate through all reference junctions and apply (wishful thinking) heuristic + ## ## + ## ## populate n x n x 2 adjacency matrix, which we will later expand to a bigger matrix + ## adj.ab = jab$adj + ## adj.ab[ref.jun] = 0 + ## adj.ref = jab$adj*0 + ## adj.ref[ref.jun] = jab$adj[ref.jun] + ## high = low = jab$segstats[, c()] + ## high$cn = jab$segstats$cn.high + ## low$cn = jab$segstats$cn.low + ## high$parent = low$parent = seq_along(jab$segstats) + ## high$type = 'high' + ## low$type = 'low' + ## high$id = seq_along(jab$segstats) + ## low$id = length(jab$segstats) + seq_along(jab$segstats) + ## asegstats = c(high, low) + ## amap = cbind(high$id, low$id) ## maps segstats id x allele combos to asegstats id + + ## aadj = sparseMatrix(1, 1, x = 0, dims = c(length(asegstats), length(asegstats))) + + ## .flip = function(x) x %% 2+1 + + ## asegstats = c(high, low) + ## acn = cbind(high$cn, low$cn) + + ## phased.out = phased.in = rep(TRUE, length(asegstats)) + + ## str = strand(asegstats) + + ## if (verbose) + ## message('Starting phasing ') + + ## for (k in 1:nrow(ref.jun)) + ## { + ## i = ref.jun[k, 1] + ## j = ref.jun[k, 2] + ## a = acn[ref.jun[k,1],] + ## b = acn[ref.jun[k,2],] + + ## phased.out[amap[i, ]] = FALSE + ## phased.in[amap[j, ]] = FALSE + + ## pairs.ij = cbind(rep(c(1:2), 2), rep(c(1:2), each = 2)) ## 4 possible matches + ## m = setdiff(which(a[pairs.ij[,1]] == b[pairs.ij[,2]]), NA) + + ## if (!(length(m) %in% c(0, 4))) ## 1,2, and 3 matches are fine (3 matches occur if one interval is in allelic balance, and the other not + ## { + ## if (length(m)==2) ## pick the phase that the alleles can handle + ## m = rev(m[order(as.numeric(sum(adj.ab[i, ])<=a[pairs.ij[m,1]]) + as.numeric(sum(adj.ab[, j])<=b[pairs.ij[m,2]]))]) + + ## m.ij = pairs.ij[m[1], ] + ## fm.ij = .flip(m.ij) + ## aadj[amap[i, m.ij[1]], amap[j, m.ij[2]]] = min(a[m.ij[1]], jab$adj[i, j]) + ## aadj[amap[i, fm.ij[1]], amap[j, fm.ij[2]]] = jab$adj[i, j] - aadj[amap[i, m.ij[1]], amap[j, m.ij[2]]] + + ## phased.out[amap[i, ]] = TRUE + ## phased.in[amap[j, ]] = TRUE + + ## if (length(a.ab <- Matrix::which(adj.ab[i,]!=0))>0) + ## { + ## ## if a.ab (partner) is already phased then unpopulate the non-ab allelic junction, otherwise populate both alleles of partner + ## ## BUG: a.ab is length 2???? + ## ## hack: replace a.ab with a.ab[1] + ## if (any(ph <- aadj[amap[i, fm.ij[1]], amap[a.ab[1], ]] !=0)) + ## { + ## aadj[amap[i, fm.ij[1]], amap[a.ab[1], ph]] = adj.ab[i, a.ab[1]] + ## aadj[amap[i, m.ij[1]], amap[a.ab[1], ph]] = 0 + ## } + ## else + ## ## otherwise diffuse copy into both alleles of the partner (will be resolved when we resolve phase for the partner interval) + ## ## or collapse unphased nodes back + ## aadj[amap[i, fm.ij[1]], amap[a.ab[1], ]] = adj.ab[i, a.ab[1]]/2 + + ## if (!conservative) + ## if (a[fm.ij[1]] < adj.ab[i, a.ab]) # if the allelic node can't handle the outgoing allelic edge flux, so unphase + ## phased.out[amap[i, ]] = FALSE + ## } + + ## if (length(b.ab <- Matrix::which(adj.ab[,j]!=0))>0) + ## { + ## ## if b.ab (partner) is already phased then concentrate all of the junction copy into the aberrant allele of this interval + ## ## BUG: why b.ab is length 2???? I thought we resolved this long ago + ## ## hack: replace a.ab with a.ab[1] + ## if (any(ph <- aadj[amap[b.ab[1], ], amap[j, fm.ij[2]]] !=0)) + ## { + ## aadj[amap[b.ab[1], ph], amap[j, fm.ij[2]]] = adj.ab[b.ab[1], j] + ## aadj[amap[b.ab[1], ph], amap[j, m.ij[2]]] = 0 + ## } + ## else + ## ## otherwise diffuse copy into both alleles of the partner (will be resolved when we resolve phase for the partner interval) + ## ## or collapse unphased nodes back + ## aadj[amap[b.ab[1],], amap[j, fm.ij[2]]] = adj.ab[b.ab[1], j]/2 + + ## if (!conservative) + ## if (b[fm.ij[2]] < adj.ab[b.ab, j]) # the allelic node cn can't handle the incoming allelic edge flux, so unphase + ## phased.in[amap[j, ]] = FALSE + ## } + ## } + ## } + + ## if (verbose) + ## message('Finished phasing, finalizing.') + + ## asegstats$phased.in = phased.in + ## asegstats$phased.out = phased.out + + ## if (uncoupled) + ## unphased = rep(FALSE, length(asegstats)) + ## else + ## unphased = !asegstats$phased.out | !asegstats$phased.in + + ## unphased.parents = unique(asegstats$parent[unphased]) + ## aadj.unphunph = jab$adj[unphased.parents, unphased.parents] + ## aadj.phph = aadj[!unphased, !unphased] + + ## asegstats$new.ind = NA + ## asegstats$new.ind[!unphased] = 1:sum(!unphased) + ## asegstats$new.ind[unphased] = as.integer(factor(asegstats$parent[unphased], unphased.parents)) + ## mat.collapse = sparseMatrix(which(unphased), asegstats$new.ind[unphased], x = 1, dims = c(nrow(aadj), length(unphased.parents))) + + ## aadj.phunph = aadj[!unphased, ] %*% mat.collapse + ## aadj.unphph = t(mat.collapse) %*% aadj[, which(!unphased)] + + ## aadj.final = rbind( + ## cbind(aadj.phph, aadj.phunph), + ## cbind(aadj.unphph, aadj.unphunph) + ## ) + + ## asegstats.unphased = asegstats[match(unphased.parents, asegstats$parent)] + ## asegstats.unphased$cn = jab$segstats$cn[asegstats.unphased$parent] + ## asegstats.final = c(asegstats[!unphased], asegstats.unphased) + ## asegstats.final$phased = c(rep(T, sum(!unphased)), rep(F, length(asegstats.unphased))) + ## asegstats.final$type[!asegstats.final$phased] = 'total' + + ## tmp.str = gr.string(gr.stripstrand(asegstats), mb = F, other.cols = 'type'); + ## asegstats$tile.id = as.integer(factor(tmp.str, unique(tmp.str))) + + ## ix = order(asegstats.final) + ## asegstats.final = asegstats.final[ix] + ## aadj.final = aadj.final[ix, ix] + + ## if (verbose) + ## message('Annotating allelic vertices') + + ## tmp.string = gr.string(asegstats, mb = F, other.cols = 'type'); tmp.string2 = gr.string(gr.flipstrand(asegstats), mb = F, other.cols = 'type') + ## asegstats$flip.ix = match(tmp.string, tmp.string2) + ## asegstats$phased = !unphased + + ## asegstats.final$edges.in = sapply(seq_along(asegstats.final), + ## function(x) {ix = Matrix::which(aadj.final[,x]!=0); paste(ix, '(', aadj.final[ix,x], ')', '->', sep = '', collapse = ',')}) + ## asegstats.final$edges.out = sapply(seq_along(asegstats.final), + ## function(x) {ix = Matrix::which(aadj.final[x, ]!=0); paste('->', ix, '(', aadj.final[x,ix], ')', sep = '', collapse = ',')}) + + ## asegstats.final$slack.in = asegstats.final$cn - Matrix::colSums(aadj.final) + ## asegstats.final$slack.out = asegstats.final$cn - Matrix::rowSums(aadj.final) + + ## asegstats.final$new.ind = asegstats.final$phased.out = asegstats.final$phased.in = asegstats.final$id = NULL + ## asegstats.final$tile.id = as.integer(factor(gr.string(gr.stripstrand(asegstats.final), mb = F, other.cols = 'type'))) + + ## m = sparseMatrix(seq_along(asegstats.final), asegstats.final$parent, x = 1); + + ## hh = rep(het.sites[, c()], 2) + ## hh$count = c(het.sites$high.count, het.sites$low.count) + ## hh$type = rep(c('high', 'low'), each = length(het.sites)) + + ## hh$ywid = 0.5 + ## atd = c( + ## gTrack(hh, angle = 0, y.field = 'count', y0 = 0, + ## colormaps = list(type = c('high' = alpha('red', 0.3), 'low' = alpha('blue', 0.3))), name = 'hets', y.quantile = 0.001, lwd.border = 2), + ## gTrack(asegstats.final, angle = 0, y.field = 'cn', y0 = 0, + ## colormaps = list(type = c('high' = alpha('red', 0.3), 'low' = alpha('blue', 0.3), 'total' = alpha('purple', 0.3))), name = 'alleles') + ## ) + + ## out = list( + ## segstats = jab$segstats, + ## asegstats = asegstats.final, + ## atd = atd, + ## agtrack = atd, + ## aadj = aadj.final, + ## ab.ix = Matrix::which((m %*% adj.ab %*% t(m))!=0, arr.ind = T), + ## ref.ix = Matrix::which((m %*% adj.ref %*% t(m))!=0, arr.ind = T) + ## ) + + return(jab) + } + + #' @name grab.hets + #' @title grab.hets + #' + #' @description + #' + #' returns allele gtrack given sites.txt from het pileup + #' + #' @param agt.fname (character) path to sites.txt + #' @param min.frac (numeric) between 0 and 1, min frequency in normal to count as het site + #' @param max.frac (numeric) between 0 and 1, max frequency in normal to count as het site + #' + #' @return allele gTrack + grab.hets = function(agt.fname = NULL, + min.frac = 0.2, + max.frac = 0.8) + { + if (is.null(agt.fname) || !file.exists(agt.fname)) { + stop("agt.fname does not exist") + } + + ## prepare and filter + agt.dt = fread(agt.fname)[alt.frac.n > min.frac & alt.frac.n < max.frac,] + ## add major and minor + agt.dt[, which.major := ifelse(alt.count.t > ref.count.t, "alt", "ref")] + agt.dt[, major.count := ifelse(which.major == "alt", alt.count.t, ref.count.t)] + agt.dt[, minor.count := ifelse(which.major == "alt", ref.count.t, alt.count.t)] + + ## melt the data frame + agt.melted = rbind(agt.dt[, .(seqnames, start, end, count = major.count, allele = "major")], + agt.dt[, .(seqnames, start, end, count = minor.count, allele = "minor")] + ) + + ## make GRanges + agt.gr = dt2gr(agt.melted[, .(seqnames, start, end, count, allele)]) + + return (agt.gr) + } + + #' @name grab.hets.from.maf + #' @title grab.hets.from.maf + #' + #' @description + #' + #' get hets into format needed by phased.binstats from HMF maf_approx + #' + #' @param agt.fname (character) + #' @param min.frac (numeric) + #' + #' @param GRanges + grab.hets.from.maf = function(agt.fname, min.frac = 0.2) { + + if (!file.exists(agt.fname)) { + stop("invalid file") + } + + gr = readRDS(agt.fname) + + if (!inherits(gr, "GRanges")) { + stop("rds file must contain GRanges object.") + } + + dt = as.data.table(gr) + dt[, major.count := ifelse(alt.count.t > ref.count.t, alt.count.t, ref.count.t)] + dt[, minor.count := ifelse(alt.count.t > ref.count.t, ref.count.t, alt.count.t)] + + ## out.dt = rbind(dt[, .(seqnames, start, end, count = major.count, allele = "major")], + ## dt[, .(seqnames, start, end, count = minor.count, allele = "minor")]) + + ## return(dt2gr(out.dt)) + return(dt) + } + + ## end of utils.R message("Reading input!") @@ -266,6 +836,1795 @@ message("Starting balance") + # override gGnome balance with zc_dev gGnome balance function) + + #' @name balance + #' @title balance gGnome graphs + #' @description + #' + #' Here we analyze gGraphs with "cn" (copy number) field to enforce integer + #' cn and junction balance, ie sum of incoming (or outgoing) edge + #' cn should be equal to node copy cn. + #' + #' The goal is to find a balaned assignment of "cn" to the nodes and edges of the gGraph + #' that maximally resemble the input weights while minimizing the loose end penalty. + #' The similarity / distance function can be weighted by optional node / edge + #' metadata field $weight (when weighted = TRUE). + #' + #' To output this gGraph, we design a MIP with + #' (1) objective function that minimizes (weighted sum of square) distance of fit node and junction copy number to provided values in + #' $cn field + #' (2) and lambda* the sum of copy number at non terminal loose ends subject to + #' (3) junction balance constraint + #' (4) fixing copy number of a subset of nodes and junctions + #' + #' Objective weight can be modulated at nodes and edges with $weight metadata + #' field (default node weight is node width, and edge weight is 1). + #' These fields will then set the penalty incurred to a fit of x to that node / edge + #' with copy number c and weight w as (x-c)^2/w. + #' + #' Lambda can be modulated at nodes with $lambda node metadata field (default 1) + #' + #' For "haplographs" ie graphs that have more than one node overlapping a given location, it may + #' be important to constrain total copy number using a haploid read depth signal. + #' The marginal parameter enables this through a GRanges annotated with $cn and optionally $weight + #' field that provides a target total copy number that the optimization will attempt to satisfy. + #' This provided copy number c and weight w (default 1) will be evaluated against the + #' sum s of the fit copy numbers of all nodes overlapping that location by adding a penalty + #' of (c-s)^2/w to the corresponding solution. marginal can also have an optional logical field + #' $fix that will actually constrain the marginal copy number to be equal to the provided value + #' (note: that the optimization may be infeasible, and function will error out) + #' + #' Additional controls can be inputted by changing the graph metadata - e.g. adding fields + #' $lb and $ub to nodes and edges will constrain their fit copy number to those bounds. + #' Adding $reward field to edges will add a reward for each copy of that edge in the solution. + #' + #' + #' @param gg gGraph with field $cn, can be NA for some nodes and edges, optional field $weight which will adjust the quadratic penalty on the fit to x as (x-$cn)^2/weight + #' @param lambda positive number specifying loose end penalty, note if gg$node metadata contain $lambda field then this lambda will be multiplied by the node level lambda (default 10) + #' @param marginal GRanges with field $cn and optional $weight field will be used to fit the summed values at each base of the genome to optimally fit the marginal value, optional field $fix will actually constrain the marginal to be the provided value + #' @param emarginal Junctions object with marginal CN in the $cn field (and optionally $weight in the weight field). optional field $fix will actually constrain the marginal to be the provided value. + #' @param tight indices or epxression on node metadata specifying at which nodes to disallow loose ensd + #' @param nfix indices or expression on node metadata specifying which node cn to fix + #' @param efix indices or expression on edge metadata specifying which edge cn to fix + #' @param nrelax indices or expression on node metadata specifying which nodes cn to relax + #' @param erelax indices or expression on edge metadata specifying which edges cn to relax + #' @param L0 flag whether to apply loose end penalty as L1 (TRUE) + #' @param loose.collapse (parameter only relevant if L0 = TRUE) will count all unique (by coordinate) instances of loose ends in the graph as the loose end penalty, rather than each instance alone ... useful for fitting a metagenome graph (FALSE) + #' @param phased (logical) indicates whether to run phased/unphased. default = FALSE + #' @param ism (logical) additional ISM constraints (FALSE) + #' @param force.major (logical) force major allele CN to be >= minor allele CN (default FALSE) + #' @param force.alt (logical) force incorporation of ALT edges, only applicable for phasing (default TRUE) + #' @param cnloh (logical) allow CN LOH? only relevant if phasing = TRUE. default FALSE. + #' @param lp (logical) solve as linear program using abs value (default TRUE) + #' @param M (numeric) big M constraint for L0 norm loose end penalty (default 1e3) + #' @param verbose (integer)scalar specifying whether to do verbose output, value 2 will spit out MIP (1) + #' @param tilim (numeric) time limit on MIP in seconds (10) + #' @param epgap (numeric) relative optimality gap threshhold between 0 and 1 (default 1e-3) + + #' @param trelim (numeric) max size of uncompressed tree in MB (default 32e3) + #' @param nodefileind (numeric) one of 0 (no node file) 1 (in memory compressed) 2 (on disk uncompressed) 3 (on disk compressed) default 1 + #' @param debug (logical) returns list with names gg and sol. sol contains full RCPLEX solution. (default FALSE) + #' @param force.haplotypes (logical) default TRUE + #' @param max.span (numeric) the maximum span of an edge below which both endpoints must be on the same parental haplotype default 1e9 + #' @param use.gurobi (logical) use gurobi optimizer? if TRUE uses gurobi instead of cplex. default FALSE. + #' @param nonintegral (logical) run without integer constraints on REF edges and nodes? default FALSE. + #' + #' @return balanced gGraph maximally resembling input gg in CN while minimizing loose end penalty lambda. + #' @author Marcin Imielinski + #' + #' @export + + balance = function(gg, + lambda = 10, + marginal = NULL, + emarginal = NULL, + tight = NULL, + nfix = NULL, efix = NULL, nrelax = NULL, erelax = NULL, + L0 = TRUE, + loose.collapse = FALSE, + M = 1e3, + phased = FALSE, + ism = TRUE, + force.major = TRUE, + force.alt = FALSE, + cnloh = FALSE, + lp = TRUE, + verbose = 1, + tilim = 10, + trelim = 32e3, + nodefileind = 1, + epgap = 1e-3, + max.span = 1e6, ## max span in bp + debug = FALSE, + use.gurobi = FALSE, + nonintegral = FALSE) + { + if (verbose) { + message("creating copy of input gGraph") + } + + if (use.gurobi) { + if (!requireNamespace("gurobi", quietly = TRUE)) { + stop("use.gurobi is TRUE but gurobi is not installed") + } + } + + gg = gg$copy + + if (verbose) { + message("Checking inputs") + } + + if (nodefileind) { + if (!(nodefileind %in% c(0,1,2,3))) { + warning("Invalid choice for nodefileind, resetting to default 1") + nodefileind = 1 + } + } + nodefileind = as.integer(nodefileind) + + if (ism) { + if (!L0) { + stop("ISM can only be set to true if using L0 penalty") + } + } + + if (!('cn' %in% names(gg$nodes$dt))) + { + warning('cn field not defined on nodes, setting to NA') + gg$nodes$mark(cn = NA_real_) + } + + if (!('cn' %in% names(gg$edges$dt))) + { + warning('cn not defined on edges, providing NA') + gg$edges$mark(cn = NA_real_) + } + + if (phased) { + if (!("allele" %in% names(gg$nodes$dt))) { + stop("cannot run phased balance without $allele field in nodes") + } + } + + if (!is.null(marginal)) { + if (!inherits(marginal, 'GRanges') || is.null(marginal$cn)) { + stop('marginal must be a GRanges with field $cn') + } + if (is.null(marginal$fix)) { + if (verbose) { + message("$fix not supplied. marginals not fixed by default.") + } + marginal$fix = 0 + } + if (is.null(marginal$weight)) { + if (verbose) { + message("$weight not supplied. set to range width in Mbp by default.") + } + marginal$weight = width(marginal) + } + } + + if (!is.null(emarginal)) { + if (!inherits(emarginal, 'Junction') || is.null(emarginal$dt$cn)) { + stop('emarginal must be Junction with field $cn') + } + ## don't mutate? + ## emarginal = emarginal$copy + if (is.null(emarginal$dt$fix)) { + if (verbose) { + message('$fix not supplied in emarginal. not fixed by default') + } + emarginal$set(fix = 0) + } + if (is.null(emarginal$dt$weight)) { + if (verbose) { + message("$weight not supplied in emarginal. set to 1 by default") + } + emarginal$set(weight = 1) + } + } + + ## default local lambda: default local lambda is 1 for consistency with JaBbA + if (!('lambda' %in% names(gg$nodes$dt))) + gg$nodes$mark(lambda = 1) + + + ## default node weight is its width + if (!('weight' %in% names(gg$nodes$dt))) + { + gg$nodes$mark(weight = width(gg$nodes$gr)) + } + + ## default edge weight is its width + if (!('weight' %in% names(gg$edges$dt))) + { + gg$edges$mark(weight = 1) + } + + ## default reward is 0 + if (!('reward' %in% names(gg$edges$dt))) + { + gg$edges$mark(reward = 0) + } + + ## handle parsing of efix, nfix, nrelax, erelax + if (!any(deparse(substitute(nfix)) == "NULL")) ## R voodo to allow "with" style evaluation + nfix = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(nfix)))), parent.frame()), gg$nodes$dt, parent.frame(2)), error = function(e) NULL) + + if (!any(deparse(substitute(nrelax)) == "NULL")) ## R voodo to allow "with" style evaluation + nrelax = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(nrelax)))), parent.frame()), gg$nodes$dt, parent.frame(2)), error = function(e) NULL) + + if (!any(deparse(substitute(efix)) == "NULL")) ## R voodo to allow "with" style evaluation + efix = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(efix)))), parent.frame()), gg$edges$dt, parent.frame(2)), error = function(e) NULL) + + if (!any(deparse(substitute(erelax)) == "NULL")) ## R voodo to allow "with" style evaluation + erelax = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(erelax)))), parent.frame()), gg$edges$dt, parent.frame(2)), error = function(e) NULL) + + if (!any(deparse(substitute(tight)) == "NULL")) ## R voodo to allow "with" style evaluation + tight = tryCatch(eval(eval(parse(text = substitute(deparse(substitute(tight)))), parent.frame()), gg$nodes$dt, parent.frame(2)), error = function(e) NULL) + + + if (is.logical(nfix)) + nfix = which(nfix) + + if (is.logical(efix)) + efix = which(efix) + + if (is.logical(nrelax)) + nrelax = which(nrelax) + + if (is.logical(erelax)) + erelax = which(erelax) + + if (length(nfix) & verbose) + message('Fixing ', length(nfix), ' nodes') + + if (length(efix) & verbose) + message('Fixing ', length(efix), ' edges') + + if (length(nrelax) & verbose) + message('Relaxing ', length(nrelax), ' nodes') + + gg$nodes[nrelax]$mark(weight = 0) + + if (length(erelax) & verbose) + message('Relaxing ', length(erelax), ' edges') + + gg$nodes[erelax]$mark(weight = 0) + + if (!is.logical(tight)) + tight = 1:length(gg$nodes) %in% tight + + if (any(tight) & verbose) + message('Leaving ', sum(tight), ' nodes tight') + + gg$nodes$mark(tight = tight) + + if (is.null(gg$nodes$dt$lb)) + gg$nodes$mark(lb = 0) + + if (is.null(gg$nodes$dt$ub)) + gg$nodes$mark(ub = Inf) + + if (is.null(gg$edges$dt$lb)) + gg$edges$mark(lb = 0) + + if (is.null(gg$edges$dt$ub)) + gg$edges$mark(ub = Inf) + + if (loose.collapse) + { + if (verbose) + message('Collapsing loose ends') + + uleft = unique(gr.start(gg$nodes$gr)) + uright = unique(gr.end(gg$nodes$gr)) + + gg$nodes$mark(loose.left.id = paste0(gr.match(gr.start(gg$nodes$gr), uleft), 'l')) + gg$nodes$mark(loose.right.id = paste0(gr.match(gr.end(gg$nodes$gr), uright), 'r')) + } + else + { + gg$nodes$mark(loose.left.id = paste0(1:length(gg$nodes), 'l')) + gg$nodes$mark(loose.right.id = paste0(1:length(gg$nodes), 'r')) + } + + ######## + ## VARIABLES + ######## + + ## create state space, keeping track of graph ids + vars = rbind( + gg$dt[, .(cn, snode.id, lb, ub, weight, gid = index, type = 'node', vtype = 'I')], + gg$sedgesdt[, .(from, to, lb, ub, sedge.id, cn, reward, + gid = sedge.id, type = 'edge', vtype = 'I', + span = gg$junctions$span[match(edge.id, gg$junctions$dt[, edge.id])])], + ## for loose ends lid marks all "unique" loose ends (which if loose.collapse = TRUE + ## will be defined on the basis of coordinate overlap) + gg$dt[tight == FALSE, .(cn = NA, snode.id, lambda, gid = index, + ulid = paste0(index, 'i'), + lid = ifelse(strand == '+', loose.left.id, paste0('-', loose.right.id)), + type = 'loose.in', vtype = 'I')], ## incoming loose ends + gg$dt[tight == FALSE, .(cn = NA, snode.id, lambda, gid = index, + ulid = paste0(index, 'o'), + lid = ifelse(strand == '+', loose.right.id, paste0('-', loose.left.id)), + type = 'loose.out', vtype = 'I')], ## outgoing loose ends + gg$dt[tight == FALSE, .(gid = index, cn, + weight, type = 'nresidual', + vtype = 'C')], ## node residual + gg$sedgesdt[, .(gid = sedge.id, cn, + weight, type = 'eresidual', + vtype = 'C')], ## edge residual + fill = TRUE) + + + ## add "slush" variables - there will be one per chromosome + if (nonintegral) { + + if (verbose) { message("Adding slush variables") } + ## grab standard chromosomes from gGraph + chr.names = grep("^(chr)*[0-9XY]+$", as.character(seqlevels(gg)), value = TRUE) + slush.dt = data.table(chr = chr.names, + lb = -0.49999, + ub = 0.49999, + type = "slush", + vtype = "C", + gid = 1:length(chr.names)) + + vars = rbind(vars, slush.dt, fill = TRUE) + + vars[type == "node", chr := as.character(gg$dt$seqnames)[match(snode.id, gg$dt$snode.id)]] + vars[!(chr %in% slush.dt[, chr]), chr := NA_character_] + + } + + if (L0) + { + ## loose ends are labeled with lid and ulid, lid is only relevant if loose.collapse is true + ## (i.e. we need indicator.sum and indicator.sum.indicator + if (verbose) { + message("adding l0 penalty indicator") + } + + vars = rbind(vars, + rbind( + vars[type == 'loose.in', ][ , type := 'loose.in.indicator'][, vtype := 'B'][, gid := lid], + vars[type == 'loose.out', ][ , type := 'loose.out.indicator'][, vtype := 'B'][, gid := lid] + )) + + if (loose.collapse) + { + ## sum will sum all the loose ends assocaited with the same lid + vars = rbind(vars, + unique(rbind( + vars[type == 'loose.in', ][ , type := 'loose.in.indicator.sum'][, vtype := 'I'][, gid := lid], + vars[type == 'loose.out', ][ , type := 'loose.out.indicator.sum'][, vtype := 'I'][, gid := lid] + ), by = 'gid')) + + ## sum.indicator is an binary indicator on the sum + vars = rbind(vars, + rbind( + vars[type == 'loose.in.indicator.sum', ][ , type := 'loose.in.indicator.sum.indicator'][, vtype := 'B'][, gid := lid], + vars[type == 'loose.out.indicator.sum', ][ , type := 'loose.out.indicator.sum.indicator'][, vtype := 'B'][, gid := lid] + )) + } + } + + if (!is.null(marginal)) { + ## first disjoin marginal against the nodes + ## ie wee ned to create a separate residual variable for every unique + ## disjoint overlap of marginal with the nodes + dmarginal = gg$nodes$gr %>% gr.stripstrand %*% grbind(marginal %>% gr.stripstrand) %>% + disjoin %$% marginal[, c('cn', 'weight', 'fix')] %Q% + (!is.na(cn)) %Q% (!is.na(weight)) %Q% (!is.infinite(weight)) + + vars = rbind(vars, + gr2dt(dmarginal)[, .(cn, weight, mfix = fix>0, + rid = 1:.N, type = 'mresidual', vtype = 'C')][, gid := rid], + fill = TRUE + ) + } + + + if (!is.null(emarginal)) { + ## we need to identify which junction in the marginal each junction in the phased graph corresponds to + junction.map = merge.Junction( + phased = gg$junctions[, c()], + emarginal = emarginal[, c("cn", "weight", "fix")], + cartesian = TRUE, + all.x = TRUE)$dt + ## match this back with edge id and add this to vars + ## vars[type == "edge", emarginal.id := junction.map[abs(sedge.id), seen.by.emarginal]] + vars[type == "edge", emarginal.id := junction.map[abs(sedge.id), subject.id]] + ## add weight and target total CN + emarginal = merge.data.table(unique( + vars[type == "edge" & !is.na(emarginal.id),][, type := "emresidual"][, .(emarginal.id, sedge.id, lb = -M, ub = M, gid, type, vtype = "C", from, to)], + by = "emarginal.id"), + junction.map[, .(subject.id, weight, cn, fix)], + by.x = "emarginal.id", + by.y = "subject.id") + vars = rbind(vars, emarginal, fill = TRUE) + } + + if (lp) { + ## need delta plus and delta minus for nodes and edges + delta.node = gg$dt[tight == FALSE, .(gid = index, cn, weight, vtype = 'C')] + delta.edge = gg$sedgesdt[, .(gid = sedge.id, cn, weight, reward, vtype = 'C')] + + deltas = rbind( + delta.node[, .(gid, weight, vtype, type = "ndelta.plus")], + delta.node[, .(gid, weight, vtype, type = "ndelta.minus")], + delta.edge[, .(gid, weight, vtype, type = "edelta.plus")], + delta.edge[, .(gid, weight, vtype, type = "edelta.minus")] + ) + + vars = rbind( + vars, + deltas, + fill = TRUE + ) + + ## add deltas for marginals if marginals are supplied + if (!is.null(marginal)) { + mdeltas = rbind( + vars[type == "mresidual", .(rid, weight, vtype, type = "mdelta.plus")][, gid := rid], + vars[type == "mresidual", .(rid, weight, vtype, type = "mdelta.minus")][, gid := rid] + ) + vars = rbind(vars, mdeltas, fill = TRUE) + } + + ## add deltas for emresiduals if emarginals are supplied + if (!is.null(emarginal)) { + emdeltas = rbind( + vars[type == "emresidual", .(emarginal.id, weight, vtype, type = "emdelta.plus")][, gid := emarginal.id], + vars[type == "emresidual", .(emarginal.id, weight, vtype, type = "emdelta.minus")][, gid := emarginal.id] + ) + vars = rbind(vars, emdeltas, fill = TRUE) + } + } + + ## moved from being ISM-specific annotation as this is more generally useful information + ## add telomeric annotation + qtips = gr.end(si2gr(seqlengths(gg$nodes))) ## location of q arm tips + term.in = c(which(start(gg$nodes$gr) == 1), ## beginning of chromosome + -which(gg$nodes$gr %^% qtips)) ## flip side of chromosome end + term.out = -term.in ## out is reciprocal of in + + ## annotate loose indicators with this + vars[!is.na(snode.id), telomeric := ifelse(snode.id %in% term.in | snode.id %in% term.out, + TRUE, + FALSE)] + + + + if (phased) { + ## add allele information and og.node.id + node.match = match(vars[, snode.id], gg$dt$snode.id) + vars[, ":="(allele = gg$dt$allele[node.match], + og.node.id = gg$dt$og.node.id[node.match])] + + + + + ## add ref/alt information and og.edge.id + edge.match = match(vars[, sedge.id], gg$sedgesdt$sedge.id) + gg$edges$mark(span = gg$junctions$span) + + vars[, ":="(ref.or.alt = gg$sedgesdt$type[edge.match], + connection = gg$sedgesdt$connection[edge.match], + og.edge.id = gg$sedgesdt$og.edge.id[edge.match], + span = gg$sedgesdt$span[edge.match], ## NEED SPAN + n1 = gg$dt$snode.id[gg$sedgesdt$from[edge.match]], + n2 = gg$dt$snode.id[gg$sedgesdt$to[edge.match]])] + + vars[, n1.side := ifelse(n1 > 0, "right", "left")] + vars[, n2.side := ifelse(n2 > 0, "left", "right")] + vars[, n1 := abs(n1)] + vars[, n2 := abs(n2)] + + edge.indicator.vars = vars[type == "edge"][, type := "edge.indicator"][, vtype := "B"][, gid := sedge.id] + vars = rbind(vars, edge.indicator.vars, fill = TRUE) + + #' zchoo Thursday, Sep 02, 2021 05:30:53 PM + #' moved to earlier + ## REF edge configuration constraint (added by default basically) + ## only add this if there are no unphased nodes + if (cnloh) { + + ## if allow CNLOH, the sum of edge indicators corresponding with og edge id is LEQ 2 + ## this is only allowed in constant CN regions and if breakpoint is not shared with any ALT edges + + ## penalize CNLOH edges + + if (!is.null(gg$edges$dt$cnloh)) { + cnloh.edges = gg$edges$dt[cnloh == TRUE & type == "ALT", edge.id] %>% unique + if (verbose) { + message("Number of marked CNLOH edges: ", length(cnloh.edges)) + } + + ## add CNLOH annotation to variables + ## browser() + vars[, cnloh := FALSE] + vars[(type == "edge.indicator" | type == "edge" | type == "eresidual") & + ref.or.alt == "ALT" & (abs(sedge.id) %in% cnloh.edges) & (sedge.id > 0), + ":="(cnloh = TRUE)] + + } else { + warning("CNLOH not specified on edges. Disallowing!") + cnloh.og.edges = c() + vars[, cnloh := FALSE] + } + } else { + + cnloh.og.edges = c() + vars[, cnloh := FALSE] + + } + + ## add node haplotype indicators + ## these are binary indicators that determine whether a node belongs to H1 + ## only constrain positive-stranded nodes due to skew symmetry + haplotype.indicators = gg$dt[(allele == "major" | allele == "minor") & snode.id > 0, + .(cn, snode.id, lb, ub, weight, og.node.id, + allele, gid = index, type = 'haplotype', vtype = 'B')] + vars = rbind(vars, haplotype.indicators, fill = TRUE) + + ## add H1 and H2 'AND' indicators which should have n1/n1.side/n2/n2.side metadata + ## only add these for low-span edges + ## and where CNLOH is FALSE + h1.and.indicators = vars[sedge.id > 0 & type == "edge" & (connection == "straight" | connection == "cross") & span < max.span & cnloh == FALSE,][, vtype := "B"][, type := "h1.and.indicator"][, gid := sedge.id] + h2.and.indicators = vars[sedge.id > 0 & type == "edge" & (connection == "straight" | connection == "cross") & span < max.span & cnloh == FALSE,][, vtype := "B"][, type := "h2.and.indicator"][, gid := sedge.id] + + vars = rbind(vars, h1.and.indicators, h2.and.indicators, fill = TRUE) + + } + + + ## if we want to implement edge reward or ISM, we need to add edge indicators + ## these are binary variables that allow us to reward/penalize L0 norms + ##if (TRUE) { ##(ism | any(gg$edges$dt$reward != 0, na.rm = TRUE)) { + ## if not phased, must add edge indicators (for just the ALT edges) + if (!phased) { + edge.match = match(vars[, sedge.id], gg$sedgesdt$sedge.id) + vars[, ":="(ref.or.alt = gg$sedgesdt$type[edge.match])] ## need ref.or.alt information + edge.indicator.vars = vars[type == "edge" & ref.or.alt == "ALT"][, type := "edge.indicator"][, vtype := "B"][, gid := sedge.id] + vars = rbind(vars, edge.indicator.vars, fill = TRUE) + } + + ## loose indicators are only required if running with ISM + if (ism) { + vars[type == "loose.in.indicator" & sign(snode.id) == 1, ee.id := paste(snode.id, "left")] + vars[type == "loose.out.indicator" & sign(snode.id) == 1, ee.id := paste(snode.id, "right")] + } + + ## but even without ISM, if we want to add reward, we must add edge indicators + vars[type == "edge.indicator" & sign(sedge.id) == 1 & ref.or.alt == "ALT", + ":="(ee.id.n1 = paste(gg$edges$dt$n1[match(sedge.id, gg$edges$dt$sedge.id)], + gg$edges$dt$n1.side[match(sedge.id, gg$edges$dt$sedge.id)]), + ee.id.n2 = paste(gg$edges$dt$n2[match(sedge.id, gg$edges$dt$sedge.id)], + gg$edges$dt$n2.side[match(sedge.id, gg$edges$dt$sedge.id)]))] + + if (phased) { + ## homologous extremity exclusivity (only for phased graphs) + ## get stranded breakpoint ID's associated with the start and end of each node + + ## number of unique starts should be equal to number of snodes in the original unphased graph + ## aka 2 * number of og edge ids + + vars[type == "loose.in.indicator", hee.id := paste(og.node.id, "in")] + vars[type == "loose.out.indicator", hee.id := paste(og.node.id, "out")] + + vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1, + ":="(og.n1 = gg$dt$og.node.id[from], + og.n1.side = gg$edges$dt$n1.side[match(abs(sedge.id), gg$edges$dt$edge.id)], + og.n2 = gg$dt$og.node.id[to], + og.n2.side = gg$edges$dt$n2.side[match(abs(sedge.id), gg$edges$dt$edge.id)])] + + vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1, + ":="(hee.id.n1 = ifelse(og.n1.side == "left", + paste(og.n1, "in"), + paste(og.n1, "out")), + hee.id.n2 = ifelse(og.n2.side == "left", + paste(og.n2, "in"), + paste(og.n2, "out")))] + + ## reciprocal homologous extremity exclusivity + ## implement config indicators. there is one per og.edge.id per configuration + straight.config = unique(vars[type == "edge.indicator" & ref.or.alt == "REF" & sedge.id > 0, ][, type := "straight.config"][, config.id := paste("straight", og.edge.id)], by = "og.edge.id") + cross.config = unique(vars[type == "edge.indicator" & ref.or.alt == "REF" & sedge.id > 0, ][, type := "cross.config"][, config.id := paste("cross", og.edge.id)], by = "og.edge.id") + + ## add these to vars + vars = rbind(vars, straight.config, cross.config, fill = TRUE) + + ## add straight/cross to REF edges + vars[type == "edge.indicator" & ref.or.alt == "REF", + connection := gg$sedgesdt$connection[match(sedge.id, gg$sedgesdt$sedge.id)]] + + ## add config ID's to corresponding edge indicators + vars[type == "edge.indicator" & ref.or.alt == "REF" & sedge.id > 0, + config.id := paste(connection, og.edge.id)] + + + ## add config ID's to corresponding edge indicators + vars[type == "edge.indicator" & ref.or.alt == "REF" & sedge.id > 0, + config.id := paste(connection, og.edge.id)] + + ## add straight edge id e.g. for each n1 and n2, add the sedge.id of the corresponding straight edge + straight.sedges = gg$edges$dt[type == "REF" & connection == "straight" & sedge.id > 0, + .(n1.full = paste(n1, n1.side), n2.full = paste(n2, n2.side), sedge.id)] + cross.sedges = gg$edges$dt[type == "REF" & connection == "cross" & sedge.id > 0, + .(n1.full = paste(n1, n1.side), n2.full = paste(n2, n2.side), sedge.id)] + + + ## pull alt edges from sedgesdt + alt.sedges = gg$edges$dt[type == "ALT" & sedge.id > 0, + .(n1.full = paste(n1, n1.side), n2.full = paste(n2, n2.side), sedge.id)] + + alt.sedges[, ":="(s1 = straight.sedges$sedge.id[match(n1.full, straight.sedges$n1.full)], + s2 = straight.sedges$sedge.id[match(n2.full, straight.sedges$n1.full)], + s3 = straight.sedges$sedge.id[match(n1.full, straight.sedges$n2.full)], + s4 = straight.sedges$sedge.id[match(n2.full, straight.sedges$n2.full)])] + + alt.sedges[, ":="(c1 = cross.sedges$sedge.id[match(n1.full, cross.sedges$n1.full)], + c2 = cross.sedges$sedge.id[match(n2.full, cross.sedges$n1.full)], + c3 = cross.sedges$sedge.id[match(n1.full, cross.sedges$n2.full)], + c4 = cross.sedges$sedge.id[match(n2.full, cross.sedges$n2.full)])] + + ## pull loose ends + vars[type == "loose.in.indicator" & snode.id > 0, n2.full := paste(snode.id, "left")] + vars[type == "loose.out.indicator" & snode.id > 0, n1.full := paste(snode.id, "right")] + + ## merge sedge.id + vars[type == "loose.in.indicator" & snode.id > 0, ":="(s = straight.sedges$sedge.id[match(n2.full, straight.sedges$n2.full)])] + vars[type == "loose.out.indicator" & snode.id > 0, ":="(s = straight.sedges$sedge.id[match(n1.full, straight.sedges$n1.full)])] + + vars[type == "loose.in.indicator" & snode.id > 0, ":="(c = cross.sedges$sedge.id[match(n2.full, cross.sedges$n2.full)])] + vars[type == "loose.out.indicator" & snode.id > 0, ":="(c = cross.sedges$sedge.id[match(n1.full, cross.sedges$n1.full)])] + + + ## merge this info into vars + vars = merge(vars, + alt.sedges[, .(sedge.id, s1, s2, s3, s4, c1, c2, c3, c4)], + by = "sedge.id", + all.x = TRUE, + all.y = FALSE) + + } + + vars[, id := 1:.N] ## set id in the optimization + vars[is.na(lb), lb := -Inf] + vars[is.na(ub), ub := Inf] + vars[, relax := FALSE][, fix := FALSE] + if ("mresidual" %in% vars$type) { + vars[type == 'mresidual' & mfix == TRUE, ":="(lb = 0, ub = 0)] + message("Number of fixed marginals: ", nrow(vars[type == 'mresidual' & mfix == TRUE,])) + } + if ("emresidual" %in% vars$type) { + vars[type == "emresidual" & fix == TRUE, ":="(lb = 0, ub = 0)] + } + + ## redo setting lb and ub + vars[type %in% c('node', 'edge') & is.na(lb), lb := 0] + vars[type %in% c('node', 'edge') & is.na(ub), ub := M] + vars[type %in% c('node', 'edge') & lb < 0, lb := 0] + vars[type %in% c('node', 'edge') & ub > M, ub := M] + ## vars[type %in% c('node', 'edge'), lb := ifelse(is.na(lb), 0, pmax(lb, 0, na.rm = TRUE)] + ## vars[type %in% c('node', 'edge'), ub := ifelse(is.na(ub), M, pmin(ub, M, na.rm = TRUE))] + vars[type %in% c('loose.in', 'loose.out'), ":="(lb = 0, ub = Inf)] + + ## reward shouldn't have to be positive + ## vars[type %in% c('edge'), reward := pmax(reward, 0, na.rm = TRUE)] + + + ## figure out junctions and nodes to fix + + vars[!is.na(cn) & type == 'node' & abs(snode.id) %in% nfix, ":="(lb = cn, ub = cn, fix = TRUE)] + vars[!is.na(cn) & type == 'edge' & abs(sedge.id) %in% efix, ":="(lb = cn, ub = cn, fix = TRUE)] + + ## figure out terminal node sides for in and out loose ends + ## these will not have loose ends penalized + qtips = gr.end(si2gr(seqlengths(gg$nodes))) ## location of q arm tips + term.in = c(which(start(gg$nodes$gr) == 1), ## beginning of chromosome + -which(gg$nodes$gr %^% qtips)) ## flip side of chromosome end + term.out = -term.in + vars$terminal = FALSE + vars[(type %in% c('loose.in', 'loose.in.indicator')) & (snode.id %in% term.in), terminal := TRUE] + vars[(type %in% c('loose.out', 'loose.out.indicator')) & (snode.id %in% term.out), terminal := TRUE] + + ## if not using integral constraints, + ## change the vtype of terminal loose ends, nodes, and REF edges + ## additionally relax their lower bound to -0.5 + if (nonintegral) { + vars[type == "loose.in" & (snode.id %in% term.in), vtype := "C"] + vars[type == "loose.out" & (snode.id %in% term.out), vtype := "C"] + vars[type == "loose.in" & (snode.id %in% term.in), lb := -0.4999] + vars[type == "loose.out" & (snode.id %in% term.out), lb := -0.4999] + vars[type == "edge" & ref.or.alt == "REF", vtype := "C"] + vars[type == "edge" & ref.or.alt == "REF", lb := -0.4999] + ## vars[type == "node", lb := -0.4999] + ## vars[type == "node", vtype := "C"] + } + + ## fix the heaviest node + ## browser() + ## maxn = vars[type == "node"][which.max(weight), snode.id] + ## vars[snode.id == maxn & type == "node", ":="(ub = (cn), lb = (cn))] + + ## browser() + ## table(vars[, .(type, vtype)]) + + ######## + ## CONSTRAINTS + ## the key principle behind this "melted" form of constraint building is the cid + ## (constraint id) which is the key that will group coefficients into constraints + ## when we finally build the matrices. So all we need to do is make sure that + ## that value / cid pairs make sense and that every cid has an entry in b + ######## + + ## we need one junction balance constraint per loose end + + ## constraints indexed by cid + if (nonintegral) { + + ## if running without integer constraints we have to include the slush variables + ## for each node + slush.sub = vars[type == "slush"] + node.slush.in = vars[type == "node", .(value = -1, + id = slush.sub$id[match(chr, slush.sub$chr)], + cid = paste("in", gid))] + node.slush.out = vars[type == "node", .(value = -1, + id = slush.sub$id[match(chr, slush.sub$chr)], + cid = paste("out", gid))] + + node.slush = rbind(node.slush.in, node.slush.out)[!is.na(id)] + + constraints = rbind( + node.slush, + vars[type == 'loose.in', .(value = 1, id, cid = paste('in', gid))], + vars[type == 'edge', .(value = 1, id, cid = paste('in', to))], + vars[type == 'node', .(value = -1, id, cid = paste('in', gid))], + vars[type == 'loose.out', .(value = 1, id, cid = paste('out', gid))], + vars[type == 'edge', .(value = 1, id, cid = paste('out', from))], + vars[type == 'node', .(value = -1, id, cid = paste('out', gid))], + fill = TRUE) + + + } else { + constraints = rbind( + vars[type == 'loose.in', .(value = 1, id, cid = paste('in', gid))], + vars[type == 'edge', .(value = 1, id, cid = paste('in', to))], + vars[type == 'node', .(value = -1, id, cid = paste('in', gid))], + vars[type == 'loose.out', .(value = 1, id, cid = paste('out', gid))], + vars[type == 'edge', .(value = 1, id, cid = paste('out', from))], + vars[type == 'node', .(value = -1, id, cid = paste('out', gid))], + fill = TRUE) + + } + + b = rbind( + vars[type == 'node', .(value = 0, sense = 'E', cid = paste('in', gid))], + vars[type == 'node', .(value = 0, sense = 'E', cid = paste('out', gid))], + fill = TRUE) + + ## add to the constraints the definitions of the node and edge + if (nonintegral) { + + ## if running without integer constraints we have to include the slush variables + ## for each node + slush.sub = vars[type == "slush"] + node.slush = vars[type == "node", .(value = 1, + id = slush.sub$id[match(chr, slush.sub$chr)], + cid = paste("nresidual", gid))] + constraints = rbind( + constraints, + rbind( + node.slush, + vars[type == 'node', .(value = 1, id, cid = paste('nresidual', gid))], + vars[type == 'nresidual', .(value = -1, id, cid = paste('nresidual', gid))], + vars[type == 'edge', .(value = 1, id, cid = paste('eresidual', gid))], + vars[type == 'eresidual', .(value = -1, id, cid = paste('eresidual', gid))], + fill = TRUE) + ) + } else { + constraints = rbind( + constraints, + rbind( + vars[type == 'node', .(value = 1, id, cid = paste('nresidual', gid))], + vars[type == 'nresidual', .(value = -1, id, cid = paste('nresidual', gid))], + vars[type == 'edge', .(value = 1, id, cid = paste('eresidual', gid))], + vars[type == 'eresidual', .(value = -1, id, cid = paste('eresidual', gid))], + fill = TRUE) + ) + } + + b = rbind(b, + vars[type == 'node', .(value = cn, sense = 'E', cid = paste('nresidual', gid))], + vars[type == 'edge', .(value = cn, sense = 'E', cid = paste('eresidual', gid))], + fill = TRUE) + + ## add the reverse complement equality constraints on nodes and edges + constraints = rbind( + constraints, + rbind( ## +1 coefficient for positive nodes, -1 for negative nodes, matched by abs (snode.id) + vars[type == 'node', .(value = sign(snode.id), id, cid = paste('nrc', abs(snode.id)))], + vars[type == 'edge', .(value = sign(sedge.id), id, cid = paste('erc', abs(sedge.id)))], + fill = TRUE) + ) + + b = rbind(b, + vars[type == 'node' & snode.id>0, .(value = 0, sense = 'E', cid = paste('nrc', abs(snode.id)))], + vars[type == 'edge' & sedge.id>0, .(value = 0, sense = 'E', cid = paste('erc', abs(sedge.id)))], + fill = TRUE) + + + ## if solving as LP, add deltas constraints (absolute value trick) + + if (lp) { + if (verbose) { + message("adding delta constraints for LP") + } + + vars[type %like% "delta.plus" | type %like% "delta.minus", ":="(ub = M, lb = 0)] + + ## add the residual constraints + ndelta.slack = rbind( + vars[type == "nresidual", .(value = -1, id, cid = paste("ndelta.minus.slack", gid))], + vars[type == "ndelta.minus", .(value = -1, id, cid = paste("ndelta.minus.slack", gid))], + vars[type == "nresidual", .(value = 1, id, cid = paste("ndelta.plus.slack", gid))], + vars[type == "ndelta.plus", .(value = -1, id, cid = paste("ndelta.plus.slack", gid))] + ) + + ndelta.slack.rhs = rbind( + vars[type == "ndelta.minus", .(value = 0, sense = "L", cid = paste("ndelta.minus.slack", gid))], + vars[type == "ndelta.plus", .(value = 0, sense = "L", cid = paste("ndelta.plus.slack", gid))] + ) + + edelta.slack = rbind( + vars[type == "eresidual", .(value = -1, id, cid = paste("edelta.minus.slack", gid))], + vars[type == "edelta.minus", .(value = -1, id, cid = paste("edelta.minus.slack", gid))], + vars[type == "eresidual", .(value = 1, id, cid = paste("edelta.plus.slack", gid))], + vars[type == "edelta.plus", .(value = -1, id, cid = paste("edelta.plus.slack", gid))] + ) + + edelta.slack.rhs = rbind( + vars[type == "edelta.minus", .(value = 0, sense = "L", cid = paste("edelta.minus.slack", gid))], + vars[type == "edelta.plus", .(value = 0, sense = "L", cid = paste("edelta.plus.slack", gid))] + ) + + mdelta.slack = rbind( + vars[type == "mresidual", .(value = -1, id, cid = paste("mdelta.minus.slack", gid))], + vars[type == "mdelta.minus", .(value = -1, id, cid = paste("mdelta.minus.slack", gid))], + vars[type == "mresidual", .(value = 1, id, cid = paste("mdelta.plus.slack", gid))], + vars[type == "mdelta.plus", .(value = -1, id, cid = paste("mdelta.plus.slack", gid))] + ) + + mdelta.slack.rhs = rbind( + vars[type == "mdelta.minus", .(value = 0, sense = "L", cid = paste("mdelta.minus.slack", gid))], + vars[type == "mdelta.plus", .(value = 0, sense = "L", cid = paste("mdelta.plus.slack", gid))] + ) + + constraints = rbind(constraints, ndelta.slack, edelta.slack, mdelta.slack, fill = TRUE) + b = rbind(b, ndelta.slack.rhs, edelta.slack.rhs, mdelta.slack.rhs, fill = TRUE) + + } + + if (phased) { + + ## add haplotype indicator constraints + ## e.g. the haplotype indicators corresponding to the same og node must add up to 1 + iconstraints = vars[type == "haplotype" & snode.id > 0, + .(value = 1, id, cid = paste("haplotype.node", og.node.id))] + rhs = unique(vars[type == "haplotype", + .(value = 1, sense = "E", cid = paste("haplotype.node", og.node.id))], + by = "cid") + + constraints = rbind(constraints, iconstraints, fill = TRUE) + b = rbind(b, rhs, fill = TRUE) + + ## check that there are two per og edge id + ## browser() + ## tmp = iconstraints[, .(count = .N), by = cid] + ## all(tmp[, count] == 2, na.rm = TRUE) + ## length(unique(rhs[, cid])) == length(unique(iconstraints[, cid])) + ## length(unique(rhs[, cid])) == length(unique(gg$nodes$dt[allele == "major", og.node.id])) + + ## add H1 AND constraint + h1.and.ids = merge.data.table(vars[type == "h1.and.indicator", .(n1, n2, edge.id = id, sedge.id)], + vars[type == "haplotype", .(n1.snode.id = snode.id, n1.id = id)], + by.x = "n1", + by.y = "n1.snode.id") %>% + merge.data.table(vars[type == "haplotype", .(n2.snode.id = snode.id, n2.id = id)], + by.x = "n2", + by.y = "n2.snode.id") + + h2.and.ids = merge.data.table(vars[type == "h2.and.indicator", .(n1, n2, edge.id = id, sedge.id)], + vars[type == "haplotype", .(n1.snode.id = snode.id, n1.id = id)], + by.x = "n1", + by.y = "n1.snode.id") %>% + merge.data.table(vars[type == "haplotype", .(n2.snode.id = snode.id, n2.id = id)], + by.x = "n2", + by.y = "n2.snode.id") + + ## verify only + sedge id + ## browser() + + ## there are four constraints that are needed to implement this first edge constraint (c1-3) + iconstraints = rbind(h1.and.ids[, .(value = 1, id = edge.id, cid = paste("h1.and.c1", sedge.id))], + h1.and.ids[, .(value = -1, id = n1.id, cid = paste("h1.and.c1", sedge.id))], + h1.and.ids[, .(value = 1, id = edge.id, cid = paste("h1.and.c2", sedge.id))], + h1.and.ids[, .(value = -1, id = n2.id, cid = paste("h1.and.c2", sedge.id))], + h1.and.ids[, .(value = 1, id = edge.id, cid = paste("h1.and.c3", sedge.id))], + h1.and.ids[, .(value = -1, id = n1.id, cid = paste("h1.and.c3", sedge.id))], + h1.and.ids[, .(value = -1, id = n2.id, cid = paste("h1.and.c3", sedge.id))]) + + rhs = rbind(h1.and.ids[, .(value = 0, sense = "L", cid = paste("h1.and.c1", sedge.id))], + h1.and.ids[, .(value = 0, sense = "L", cid = paste("h1.and.c2", sedge.id))], + h1.and.ids[, .(value = -1, sense = "G", cid = paste("h1.and.c3", sedge.id))]) + + constraints = rbind(constraints, iconstraints, fill = TRUE) + b = rbind(b, rhs, fill = TRUE) + + ## tmp = iconstraints[, .(count = .N), by = cid] + ## all(tmp[cid %like% "c1", count] == 2) + ## all(tmp[cid %like% "c2", count] == 2) + ## all(tmp[cid %like% "c3", count] == 3) + + + iconstraints = rbind(h2.and.ids[, .(value = 1, id = edge.id, cid = paste("h2.and.c1", sedge.id))], + h2.and.ids[, .(value = 1, id = n1.id, cid = paste("h2.and.c1", sedge.id))], + h2.and.ids[, .(value = 1, id = edge.id, cid = paste("h2.and.c2", sedge.id))], + h2.and.ids[, .(value = 1, id = n2.id, cid = paste("h2.and.c2", sedge.id))], + h2.and.ids[, .(value = 1, id = edge.id, cid = paste("h2.and.c3", sedge.id))], + h2.and.ids[, .(value = 1, id = n1.id, cid = paste("h2.and.c3", sedge.id))], + h2.and.ids[, .(value = 1, id = n2.id, cid = paste("h2.and.c3", sedge.id))]) + + rhs = unique(rbind(h2.and.ids[, .(value = 1, sense = "L", cid = paste("h2.and.c1", sedge.id))], + h2.and.ids[, .(value = 1, sense = "L", cid = paste("h2.and.c2", sedge.id))], + h2.and.ids[, .(value = 1, sense = "G", cid = paste("h2.and.c3", sedge.id))]), + by = "cid") + + constraints = rbind(constraints, iconstraints, fill = TRUE) + b = rbind(b, rhs, fill = TRUE) + + ## verify that there are no weird NA's and that there is only one set of constraints per sedge.id + ## tmp = iconstraints[, .(count = .N), by = cid] + ## all(tmp[cid %like% "c1", count] == 2) + ## all(tmp[cid %like% "c2", count] == 2) + ## all(tmp[cid %like% "c3", count] == 3) + + ## connect edge indicators to the haplotype configuration of connected edges + iconstraints = rbind(vars[type == "h1.and.indicator", + .(value = -1, id, cid = paste("haplotype.indicator", sedge.id))], + vars[type == "h2.and.indicator", + .(value = -1, id, cid = paste("haplotype.indicator", sedge.id))], + vars[type == "edge.indicator" & + (sedge.id %in% vars[type == "h1.and.indicator",]$sedge.id), + .(value = 1, id, cid = paste("haplotype.indicator", sedge.id))]) + rhs = unique(iconstraints[, .(value = 0, sense = "L", cid)], by = "cid") + + constraints = rbind(constraints, iconstraints, fill = TRUE) + b = rbind(b, rhs, fill = TRUE) + + ## verify that there are three of these per sedge.id! + ## tmp = iconstraints[, .(count = .N), by = cid] + ## all(tmp[, count] == 3) + + ## add constraints that force indicators to be 1 if edge CN > 0 + + ## add constraints for upper bound (same setup as L0 penalty) - one per edge + iconstraints = vars[type == "edge", .(value = 1, id, + sedge.id, + cid = paste("edge.indicator.ub", sedge.id))] + + ## add matching indicator variables, matching by cid + iconstraints = rbind( + iconstraints, + vars[type == "edge.indicator", ][ + sedge.id %in% iconstraints$sedge.id, .(value = -M, id, cid = iconstraints$cid, sedge.id)], + fill = TRUE) + + ## upper bound is M if indicator is positive, and zero otherwise + constraints = rbind( + constraints, + iconstraints, + fill = TRUE) + + ## add the RHS of this constraint (upper bound) + b = rbind( + b, + vars[type == "edge", .(value = 0, sense = "L", cid = paste("edge.indicator.ub", sedge.id))], + fill = TRUE + ) + + ## add constraints for the lower bound + iconstraints = vars[type == "edge", + .(value = 1, id, sedge.id, cid = paste("edge.indicator.lb", sedge.id))] + + ## add matching indicator variables for LB + iconstraints = rbind( + iconstraints, + vars[type == "edge.indicator", ][sedge.id %in% iconstraints$sedge.id, + .(value = -0.1, id, cid = iconstraints$cid, sedge.id)], + fill = TRUE) + + constraints = rbind( + constraints, + iconstraints, + fill = TRUE) + + ## add the RHS of this constraint (upper bound) + b = rbind( + b, + vars[type == "edge", .(value = 0, sense = "G", cid = paste("edge.indicator.lb", sedge.id))], + fill = TRUE + ) + } + + ## implement edge indicators for ISM and edge reward + if (ism | any(gg$edges$dt$reward != 0, na.rm = TRUE)) { + + ## implement edge edge indicators if not already (e.g. if not doing phasing) + if (!phased) { + + ## importantly, we only want to add these for ALT edges + iconstraints = vars[type == "edge" & ref.or.alt == "ALT" & sign(sedge.id) == 1, + .(value = 1, id, + sedge.id, + cid = paste("edge.indicator.ub", sedge.id))] + + ## add matching indicator variables, matching by cid + iconstraints = rbind( + iconstraints, + vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1, ][ + sedge.id %in% iconstraints$sedge.id, + .(value = -M, id, cid = iconstraints$cid, sedge.id)], + fill = TRUE) + + ## upper bound is M if indicator is positive, and zero otherwise + constraints = rbind( + constraints, + iconstraints, + fill = TRUE) + + ## add the RHS of this constraint (upper bound) + b = rbind( + b, + vars[type == "edge" & ref.or.alt == "ALT" & sign(sedge.id) == 1, + .(value = 0, sense = "L", cid = paste("edge.indicator.ub", sedge.id))], + fill = TRUE + ) + + ## add constraints for the lower bound + iconstraints = vars[type == "edge" & ref.or.alt == "ALT" & sign(sedge.id) == 1, + .(value = 1, id, sedge.id, cid = paste("edge.indicator.lb", sedge.id))] + + ## add matching indicator variables for LB + iconstraints = rbind( + iconstraints, + vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1, ][ + sedge.id %in% iconstraints$sedge.id, + .(value = -0.1, id, cid = iconstraints$cid, sedge.id)], + fill = TRUE) + + constraints = rbind( + constraints, + iconstraints, + fill = TRUE) + + ## add the RHS of this constraint (upper bound) + b = rbind( + b, + vars[type == "edge" & ref.or.alt == "ALT" & sign(sedge.id) == 1, + .(value = 0, sense = "G", cid = paste("edge.indicator.lb", sedge.id))], + fill = TRUE + ) + } + + ## fix loose ends at zero if there's a junction there (only valid if not phasing) + #' zchoo Tuesday, Jun 15, 2021 11:53:15 AM + #' this constraint appears to be valid even if running phasing. + ## if (!phased) { + ## extremity exclusivity (relevant for ALL graphs) + + if (ism) { + loose.constraints = rbind( + vars[type == "loose.in.indicator" & sign(snode.id) == 1 & telomeric == FALSE, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))], + vars[type == "loose.out.indicator" & sign(snode.id) == 1 & telomeric == FALSE, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))] + ) + + edge.constraints = rbind( + vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n1))], + vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id) == 1, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id.n2))] + ) + + constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE) + + loose.b = unique(loose.constraints[, .(cid, value = 1, sense = "L")], by = "cid") + edge.b = unique(edge.constraints[, .(cid, value = 1, sense = "L")], by = "cid") + + b = rbind(b, edge.b, loose.b, fill = TRUE) + + ## fix loose ends at zero if they coincide with a called junction + edge.ee.ids = unique(c(vars[type == "edge.indicator", ee.id.n1], vars[type == "edge.indicator", ee.id.n2])) + edge.ee.ids = edge.ee.ids[!is.na(edge.ee.ids)] + + loose.zeros = rbind( + vars[type == "loose.in.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))], + vars[type == "loose.out.indicator" & sign(snode.id) == 1 & ee.id %in% edge.ee.ids, + .(value = 1, id, cid = paste("extremity.exclusivity", ee.id))] + ) + + loose.zeros.rhs = unique(loose.zeros[, .(cid, value = 0, sense = "E")], by = "cid") + + constraints = rbind(constraints, loose.zeros, fill = TRUE) + b = rbind(b, loose.zeros.rhs, fill = TRUE) + } + } + + if (phased) { + ## homologous extremity exclusivity + ## this is actually redundant with previous constraints + + if (ism) { + loose.constraints = rbind( + vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE, + .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))], + vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE, + .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id))] + ) + + edge.constraints = rbind( + vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1, + .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n1))], + vars[type == "edge.indicator" & ref.or.alt == "ALT" & sign(sedge.id)==1, + .(value = 1, id, cid = paste("homol.extremity.exclusivity", hee.id.n2))] + ) + + ## we should allow loose ends to violate this as some loose ends are germline + ## therefore, loose ends with id's not shared with edge constraints + loose.constraints = loose.constraints[(cid %in% edge.constraints[, cid])] + + ## add these constraints to the existing table + constraints = rbind(constraints, loose.constraints, edge.constraints, fill = TRUE) + + rhs = unique(rbind( + vars[type == "loose.in.indicator" & sign(snode.id)==1 & telomeric == FALSE, + .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))], + vars[type == "loose.out.indicator" & sign(snode.id)==1 & telomeric == FALSE, + .(value = 1, sense = "L", cid = paste("homol.extremity.exclusivity", hee.id))] + ), by = "cid") + + rhs = rhs[cid %in% edge.constraints[, cid]] + + b = rbind(b, rhs, fill = TRUE) + + if (verbose) { + message("Number of homologous extremity exclusivity constraints: ", + nrow(rhs)) + } + + config.dt = vars[type == "straight.config" | type == "cross.config",] + + config.constraints.lt = rbind( + vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "REF", + .(value = -1, id, cid = paste("config lt", sedge.id))], + vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "REF", + .(value = 1, id = config.dt$id[match(config.id, config.dt$config.id)], + cid = paste("config lt", sedge.id))]) + + config.constraints.gt = rbind( + vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "REF", + .(value = -1, id, cid = config.id)], + vars[type == "straight.config" & sedge.id > 0 & ref.or.alt == "REF", + .(value = 1, id, cid = config.id)], + vars[type == "cross.config" & sedge.id > 0 & ref.or.alt == "REF", + .(value = 1, id, cid = config.id)]) + + rhs = unique(rbind( + config.constraints.lt[, .(cid, value = 0, sense = "G")], + config.constraints.gt[, .(cid, value = 0, sense = "L")]), + by = "cid") + + constraints = rbind(constraints, config.constraints.lt, config.constraints.gt, fill = TRUE) + b = rbind(b, rhs, fill = TRUE) + + ## implement reciprocal homologous extremity exclusivity + straight.config.dt = vars[type == "straight.config",] + cross.config.dt = vars[type == "cross.config",] + + rhomol.constraints = rbind( + ## corresponding cross indicator + vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "REF" & connection == "straight", + .(value = 1, id = cross.config.dt$id[match(og.edge.id, cross.config.dt$og.edge.id)], + cid = paste("rhee", sedge.id))], + + ## corresponding cross indicator + vars[type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "REF" & connection == "cross", + .(value = 1, id = straight.config.dt$id[match(og.edge.id, straight.config.dt$og.edge.id)], + cid = paste("rhee", sedge.id))], + + ## actual ALT edges + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s1), + .(value = 1, id, cid = paste("rhee", s1))], + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s2), + .(value = 1, id, cid = paste("rhee", s2))], + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s3), + .(value = 1, id, cid = paste("rhee", s3))], + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(s4), + .(value = 1, id, cid = paste("rhee", s4))], + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c1), + .(value = 1, id, cid = paste("rhee", c1))], + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c2), + .(value = 1, id, cid = paste("rhee", c2))], + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c3), + .(value = 1, id, cid = paste("rhee", c3))], + vars[(!cnloh == TRUE) & type == "edge.indicator" & sedge.id > 0 & ref.or.alt == "ALT" & !is.na(c4), + .(value = 1, id, cid = paste("rhee", c4))], + + ## loose indicators + vars[type == "loose.in.indicator" & snode.id > 0 & !is.na(s) & telomeric == FALSE, + .(value = 1, id, cid = paste("rhee", s))], + vars[type == "loose.in.indicator" & snode.id > 0 & !is.na(c) & telomeric == FALSE, + .(value = 1, id, cid = paste("rhee", c))], + vars[type == "loose.out.indicator" & snode.id > 0 & !is.na(s) & telomeric == FALSE, + .(value = 1, id, cid = paste("rhee", s))], + vars[type == "loose.out.indicator" & snode.id > 0 & !is.na(c) & telomeric == FALSE, + .(value = 1, id, cid = paste("rhee", c))] + ) + + rhs = unique(rhomol.constraints[, .(value = 2, sense = "L", cid)], by = "cid") + + if (verbose) { + message("Number of reciprocal homologous constraints: ", nrow(rhs)) + } + + constraints = rbind(constraints, rhomol.constraints, fill = TRUE) + b = rbind(b, rhs, fill = TRUE) + + } + + ## add the edge indicator sum constraints (ISM consistency) + iconstraints = unique( + vars[type == "edge.indicator" & ref.or.alt == "ALT", + .(value = 1, id, og.edge.id, + edge.id = abs(sedge.id), + cid = paste("edge.indicator.sum.ub", og.edge.id))], + by = "edge.id" + ) + + constraints = rbind( + constraints, + iconstraints[, .(value, id, cid)], + fill = TRUE) + + edge.indicator.b = unique( + vars[type == "edge.indicator" & ref.or.alt == "ALT", + .(value = 1, sense = "L", cid = paste("edge.indicator.sum.ub", og.edge.id))], + by = "cid" + ) + + b = rbind(b, edge.indicator.b, fill = TRUE) + + ## force major allele to have higher CN than minor allele + ## may not work for phased blocks + if (force.major) { + + iconstraints = rbind( + vars[type == "node" & allele == "major" & snode.id > 0, + .(value = 1, id, cid = paste("force.major", og.node.id))], + vars[type == "node" & allele == "minor" & snode.id > 0, + .(value = -1, id, cid = paste("force.major", og.node.id))]) + + rhs = unique(vars[type == "node" & snode.id > 0 & allele == "major", + .(value = 0, sense = "G", cid = paste("force.major", og.node.id))], + by = "cid") + + constraints = rbind(constraints, iconstraints, fill = TRUE) + b = rbind(b, rhs, fill = TRUE) + + } + + + ## force nonzero CN for ALT edges (because these have nonzero CN in original JaBbA output) + ## can become infeasible if original graph is not compatible with ISM + if (force.alt) { + + if (ism) { + warning("Forcing ALT edges while running ISM can make some problems infeasible!") + } + + iconstraints = unique( + vars[type == "edge.indicator" & ref.or.alt == "ALT" & cnloh != TRUE & sedge.id > 0, + .(value = 1, id, og.edge.id, + edge.id = abs(sedge.id), + cid = paste("edge.indicator.sum.lb", og.edge.id))], + by = "edge.id" + ) + + constraints = rbind( + constraints, + iconstraints[, .(value, id, cid)], + fill = TRUE) + + edge.indicator.b = unique( + vars[type == "edge.indicator" & ref.or.alt == "ALT" & cnloh != TRUE & sedge.id > 0, + .(value = 1, sense = "G", cid = paste("edge.indicator.sum.lb", og.edge.id))], + by = "cid" + ) + + b = rbind(b, edge.indicator.b, fill = TRUE) + } + } + + + if (L0) ## add "big M" constraints + { + ## indicator constraints ie on ulids + iconstraints = rbind( + vars[type == 'loose.out', .(value = 1, id, ulid, cid = paste('loose.out.indicator.ub', ulid))], + vars[type == 'loose.in', .(value = 1, id, ulid, cid = paste('loose.in.indicator.ub', ulid))], + fill = TRUE) + + ## add the matching indicator variables, matching to the cid from above + iconstraints = rbind( + iconstraints, + vars[type %in% c('loose.out.indicator', 'loose.in.indicator'), ][ + match(iconstraints$ulid, ulid), .(value = -M, id, cid = iconstraints$cid)], + fill = TRUE) + + ## upper bounds "infinity" ie M if indicator positive, 0 otherwise + constraints = rbind( + constraints, + iconstraints, + fill = TRUE) + + } + + if (L0) ## add "big M" constraints + { + ## indicator constraints ie on ulids + iconstraints = rbind( + vars[type == 'loose.out', .(value = 1, id, ulid, cid = paste('loose.out.indicator.ub', ulid))], + vars[type == 'loose.in', .(value = 1, id, ulid, cid = paste('loose.in.indicator.ub', ulid))], + fill = TRUE) + + ## add the matching indicator variables, matching to the cid from above + iconstraints = rbind( + iconstraints, + vars[type %in% c('loose.out.indicator', 'loose.in.indicator'), ][ + match(iconstraints$ulid, ulid), .(value = -M, id, cid = iconstraints$cid)], + fill = TRUE) + + ## upper bounds "infinity" ie M if indicator positive, 0 otherwise + constraints = rbind( + constraints, + iconstraints, + fill = TRUE) + + ## upper bound sense is 'L' i.e. less than because -M on left hand side + b = rbind(b, + vars[type == 'loose.in', .(value = 0, sense = 'L', cid = paste('loose.in.indicator.ub', ulid))], + vars[type == 'loose.out', .(value = 0, sense = 'L', cid = paste('loose.out.indicator.ub', ulid))], + fill = TRUE) + + ## lower bound 0.1 if indicator positive, 0 otherwise + iconstraints = rbind( + vars[type == 'loose.out', .(value = 1, id, ulid, cid = paste('loose.out.indicator.lb', ulid))], + vars[type == 'loose.in', .(value = 1, id, ulid, cid = paste('loose.in.indicator.lb', ulid))], + fill = TRUE) + + ## add the matching indicator variables, matching to the cid from above + iconstraints = rbind( + iconstraints, + vars[type %in% c('loose.out.indicator', 'loose.in.indicator'), ][ + match(iconstraints$ulid, ulid), .(value = -.1, id, cid = iconstraints$cid)], + fill = TRUE) + + ## upper bounds "infinity" ie M if indicator positive, 0 otherwise + constraints = rbind( + constraints, + iconstraints, + fill = TRUE) + + ## lower bound sense is 'G' i.e. greater than because -M on left hand side + b = rbind(b, + vars[type == 'loose.in', .(value = 0, sense = 'G', cid = paste('loose.in.indicator.lb', ulid))], + vars[type == 'loose.out', .(value = 0, sense = 'G', cid = paste('loose.out.indicator.lb', ulid))], + fill = TRUE) + + if (loose.collapse) + { + ################## + ## loose indicator sum = sum of indicators + ################## + iconstraints = rbind( + vars[type == 'loose.out.indicator', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum', lid))], + vars[type == 'loose.in.indicator', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum', lid))], + fill = TRUE) + + ## indicator sum is the sum of all indicators mapping to that loose end + iconstraints = rbind( + iconstraints, + unique(vars[type %in% c('loose.out.indicator.sum', 'loose.in.indicator.sum'), ][ + match(iconstraints$lid, lid), .(value = -1, id, lid, cid = iconstraints$cid)], by = 'lid'), + fill = TRUE) + + constraints = rbind( + constraints, + iconstraints, + fill = TRUE) + + b = rbind(b, + vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'E', cid = paste('loose.in.indicator.sum', lid))], + vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'E', cid = paste('loose.out.indicator.sum', lid))], + fill = TRUE) + + ################## + ## now we make new indicator variables on the sum of the individual loose end indicators + ## upper bound bound 0.1 if indicator positive, 0 otherwise + ################## + + iconstraints = rbind( + vars[type == 'loose.out.indicator.sum', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum.indicator.ub', lid))], + vars[type == 'loose.in.indicator.sum', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum.indicator.ub', lid))], + fill = TRUE) + + ## add the matching indicator variables, matching to the cid from above + iconstraints = rbind( + iconstraints, + vars[type %in% c('loose.out.indicator.sum.indicator', 'loose.in.indicator.sum.indicator'), ][ + match(iconstraints$lid, lid), .(value = -M, id, lid, cid = iconstraints$cid)], + fill = TRUE) + + ## upper bounds "infinity" ie M if indicator positive, 0 otherwise + constraints = rbind( + constraints, + iconstraints, + fill = TRUE) + + ## upper bound sense is 'L' i.e. less than because -M on left hand side + b = rbind(b, + vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'L', cid = paste('loose.in.indicator.sum.indicator.ub', lid))], + vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'L', cid = paste('loose.out.indicator.sum.indicator.ub', lid))], + fill = TRUE) + + ## lower bound 0.1 if indicator positive, 0 otherwise + iconstraints = rbind( + vars[type == 'loose.out.indicator.sum', .(value = 1, id, lid, cid = paste('loose.out.indicator.sum.indicator.lb', lid))], + vars[type == 'loose.in.indicator.sum', .(value = 1, id, lid, cid = paste('loose.in.indicator.sum.indicator.lb', lid))], + fill = TRUE) + + ## add the matching indicator variables, matching to the cid from above + iconstraints = rbind( + iconstraints, + vars[type %in% c('loose.out.indicator.sum', 'loose.in.indicator.sum'), ][ + match(iconstraints$lid, lid), .(value = -.1, id, lid, cid = iconstraints$cid)], + fill = TRUE) + + ## upper bounds "infinity" ie M if indicator positive, 0 otherwise + constraints = rbind( + constraints, + iconstraints, + fill = TRUE) + + ## lower bound sense is 'G' i.e. greater than because -M on left hand side + b = rbind(b, + vars[type == 'loose.in.indicator.sum', .(value = 0, sense = 'G', cid = paste('loose.in.indicator.sum.indicator.lb', lid))], + vars[type == 'loose.out.indicator.sum', .(value = 0, sense = 'G', cid = paste('loose.out.indicator.sum.indicator.lb', lid))], + fill = TRUE) + + } + } + + + if (!is.null(marginal) && length(dmarginal)) + { + ## match against nodes and store query.id as rid + ## this will be the constraint id that will allow us + ## to sum the appropriate nodes to constrain to the residual + ov = dmarginal[, c('cn', 'weight')] %*% gg$nodes$gr %>% gr2dt + + ov[, rid := query.id] + + constraints = rbind( + constraints, + rbind( + ## match up vars and marginal by snode.id and populate coefficients + merge.data.table(vars[type == 'node', !"rid"], ov, by = 'snode.id')[, .(value = 1, id , cid = paste('mresidual', rid))], + ## the residual is the difference between the sum and marginal cn + vars[type == 'mresidual' & rid %in% ov$rid, .(value = -1, id, cid = paste('mresidual', rid))], + fill = TRUE), + fill = TRUE + ) + + b = rbind(b, + vars[type == 'mresidual' & rid %in% ov$rid, .(value = cn, sense = 'E', cid = paste('mresidual', rid))], + fill = TRUE) + } + + if (!is.null(emarginal)) { + + emconstraints = rbind( + vars[type == "edge", .(value = 1, id, cid = paste("emresidual", emarginal.id))], + vars[type == "emresidual", .(value = -1, id, cid = paste("emresidual", emarginal.id))] + ) + + constraints = rbind(constraints, emconstraints, fill = TRUE) + + emb = vars[type == "emresidual", .(value = cn, sense = "E", cid = paste("emresidual", emarginal.id))] + + b = rbind(emb, b, fill = TRUE) + } + + ######## + ## MAKE MATRICES + ######## + + ## now Rcplex time + ## remove any rows with b = NA + ## get rid of any constraints with NA values + keep.constraints = intersect(b[!is.na(value), cid], constraints[!is.na(value), cid]) + b = b[cid %in% keep.constraints,] + constraints = constraints[cid %in% keep.constraints,] + + ## convert constraints to integers + ucid = unique(b$cid) + b[, cid.char := cid] + b[, cid := cid %>% factor(ucid) %>% as.integer] + constraints[, cid.char := cid] + constraints[, cid := cid %>% factor(ucid) %>% as.integer] + + pmt = match(ucid, b$cid.char) ## get right permutation + bvec = b[pmt, value] + sense = b[pmt, sense] + if (verbose) { + message("Unique cids (A): ", length(unique(constraints$cid))) + message("Unique cids (b): ", length(unique(b$cid))) + message("Number of variables: ", length(unique(constraints$id))) + } + + ## create constraint matrix, Qmat, and cobj, lb, ub from vars and constraints lambda = 10 + Amat = sparseMatrix(constraints$cid, constraints$id, x = constraints$value, dims = c(length(ucid), nrow(vars))) + vars[is.na(weight), weight := 0] + + if (verbose) { + + message("bvec length: ", length(bvec)) + message("Amat nrow: ", nrow(Amat)) + + } + if (any(ix <- is.infinite(vars$weight))) + { + warning('nodes with infinite weight, setting to 0, please check inputs') + vars[ix, weight := 0] + } + Qmat = vars[, weight * (type %in% c('nresidual', 'eresidual', 'mresidual'))] %>% as.numeric %>% Diagonal(x = .) %>% as('CsparseMatrix') + + ## set lambda to 0 at terminal or other non NA nodes + vars[is.na(lambda), lambda := 0] + + + ## set cvec by multiplying global lambda by local lambda for non-terminal loose end + ## vars (or their indicators if L0 is TRUE) + if (L0) + { + if (loose.collapse) + { + cvec = lambda*(vars[, lambda*(type %in% c('loose.in.indicator.sum.indicator', 'loose.out.indicator.sum.indicator') & !terminal)] %>% as.numeric) + ## cvec = lambda*(vars[, lambda*(type %in% c('loose.in.indicator.sum.indicator', 'loose.out.indicator.sum.indicator', 'loose.in.indicator', 'loose.out.indicator') & !terminal)] %>% as.numeric) + } + else + { + cvec = lambda*(vars[, lambda * (type %in% c('loose.in.indicator', 'loose.out.indicator') & !terminal)] %>% as.numeric) + } + } else { + cvec = lambda*(vars[, lambda*(type %in% c('loose.in', 'loose.out') & !terminal)] %>% as.numeric) + } + + ## message("CVEC: ", length(cvec)) + + if (length(indices <- which(vars[, type == "edge.indicator" & reward != 0]))) + { + if (verbose) { + } + message('Applying reward') + ## grab edge indicator variables with reward + cvec[indices] = -vars$reward[indices] + } + + if (lp) { + ## add weights of stuff + indices = which(vars$type %in% c("mdelta.plus", "mdelta.minus", + "ndelta.plus", "ndelta.minus", + "edelta.plus", "edelta.minus")) + wts = vars$weight[indices] + cvec[indices] = wts + Qmat = NULL ## no Q if solving LP + } + + ## browser() + if (cnloh) { + + if ("cnloh" %in% colnames(vars)) { + indices = which(vars$type == "edge.indicator" & !is.na(vars$cnloh) & vars$cnloh == TRUE) + cvec[indices] = lambda + + message("Number of penalized CNLOH edges: ", length(indices)) + } + } + + ## check constraints of CNLOH + ## browser() + ## vars[type == "edge.indicator" & cnloh == TRUE] + ## vars[type == "edge.indicator" & cnloh == TRUE, .N, by = og.edge.id] + + + lb = vars$lb + ub = vars$ub + + control = list(trace = ifelse(verbose>=2, 1, 0), tilim = tilim, epgap = epgap, round = 1, trelim = trelim, nodefileind = nodefileind, method = 4) + + ## call our wrapper for CPLEX + if (use.gurobi) { + + if (verbose) { message("Starting optimization with gurobi!") } + + sol = run_gurobi(cvec = cvec, + Amat = Amat, + bvec = bvec, + Qmat = Qmat, + lb = lb, + ub = ub, + sense = sense, + vtype = vars$vtype, + objsense = "min", + control = control) + } else { + + if (verbose) { message("Starting optimization with CPLEX!") } + + sol = gGnome:::Rcplex2(cvec, + Amat, + bvec, + Qmat = Qmat, + lb = lb, + ub = ub, + sense = sense, + vtype = vars$vtype, + objsense = "min", + control = control, + tuning = FALSE) + } + + vars$cvec = cvec + vars$x = sol$x + + ## for debugging + ppc = function(x) (x %>% merge(vars, by = 'id') %>% merge(b, by = 'cid.char'))[, paste(paste(round(value.x, 1), '*', paste(type, gid, sep= '_'), '(', signif(x, 2), ')', collapse = ' + '), ifelse(sense[1] == 'E', '=', ifelse(sense[1] == 'G', '>=', '<=')), round(value.y[1],2)), by = cid.char] + + ppv = function(x) {tmp = x %>% merge(constraints, by = 'id'); constraints[cid %in% tmp$cid, ] %>% ppc} + + .check = function(x) data.table(obs = sign(as.numeric(round(Amat %*% x - bvec))), + sense) + chk = .check(sol$x) + + if (any(is.na(sol$x))) + stop('Rcplex did not converge or failed to find a solution, please run with verbose = 2 to get more detailed output') + + if (chk[sense == 'E', any(obs != 0, na.rm = TRUE)] | + chk[sense == 'G', any(obs < 0, na.rm = TRUE)] | + chk[sense == 'L', any(obs > 0, na.rm = TRUE)]) + stop('Constraint violation likely due to M parameter being too large for problem causing CPLEX numerical instability, consider lowering M parameter') + + ##.obj = function(x) 0.5 * rbind(x) %*% Qmat %*% cbind(x) + cvec %*% x + + ## mark haplotypes if phasing + if (phased) { + haplotypes.dt = vars[type == "haplotype" & snode.id > 0, + .(node.id = snode.id, + haplotype = ifelse(x == 1, "h1", "h2"), + col = ifelse(x == 1, alpha("red", 0.5), alpha("blue", 0.5)))] + gg$nodes[haplotypes.dt$node.id]$mark(haplotype = haplotypes.dt$haplotype) + gg$nodes[haplotypes.dt$node.id]$mark(col = haplotypes.dt$col) + } + + ## update graph + nmark = vars[type == 'node', .(nid = abs(snode.id), cn = round(x))] + emark = vars[type == 'edge', .(eid = abs(sedge.id), cn = round(x))] + + loosei = vars[type == 'loose.in' & snode.id>0, .(cn = round(x)), keyby = snode.id] + looseo = vars[type == 'loose.out' & snode.id>0, .(cn = round(x)), keyby = snode.id] + + nodes = gg$nodes[loosei$snode.id] ## need to do this to use nodes active binding settings + nodes$loose.left = loosei$cn>0 + + nodes = gg$nodes[looseo$snode.id] ## need to do this to use nodes active binding settings + nodes$loose.right = looseo$cn>0 + + gg$nodes$mark(loose.cn.left = 0, loose.cn.right = 0) + gg$nodes[loosei$snode.id]$mark(loose.cn.left = loosei$cn) + gg$nodes[looseo$snode.id]$mark(loose.cn.right = looseo$cn) + + ## cache old cn values + gg$nodes$mark(cn.old = gg$nodes$dt$cn) + gg$edges$mark(cn.old = gg$edges$dt$cn) + gg$nodes$mark(cn = NULL) ## reset to avoid weird type casting issue + gg$edges$mark(cn = NULL) ## reset to avoid weird type casting issue + gg$nodes[nmark$nid]$mark(cn = nmark$cn) + gg$edges[emark$eid]$mark(cn = emark$cn) + gg$set(y.field = 'cn') + + gg$set(obj = sol$obj) + gg$set(status = sol$status) + gg$set(epgap = sol$epgap) + if (!use.gurobi) { + gg$set(code = readRDS(system.file('extdata', 'cplex_codes.rds', package="gGnome"))[.(sol$status), code]) + } + + if (verbose) { + message("CPLEX epgap ", sol$epgap, " with solution status ", gg$meta$code) + } + + ## fix loose ends + nodes = gg$nodes + nodes$loose.left = nodes$dt$loose.cn.left>0 + nodes$loose.right = nodes$dt$loose.cn.right>0 + + ## if phased, mark edges with different colors to make it easier to visualize + if (phased) { + if (verbose) { + message("formatting phased graph...") + } + ## edge formatting + ref.edge.col = alpha("blue", 0.5) + alt.edge.col = alpha("red", 0.5) + ref.edge.lwd = 1.0 + alt.edge.lwd = 1.0 + edge.col = ifelse(gg$edges$dt$type == "REF", ref.edge.col, alt.edge.col) + edge.lwd = ifelse(gg$edges$dt$type == "REF", ref.edge.lwd, alt.edge.lwd) + gg$edges$mark(col = edge.col, lwd = edge.lwd) + + ## mark zero cn edges + zero.cn.col = alpha("gray", 0.1) + zero.cn.lwd = 0.5 + zero.cn.edges = which(gg$edges$dt$cn == 0) + gg$edges[zero.cn.edges]$mark(col = zero.cn.col, lwd = zero.cn.lwd) + } else { + + ## edge formatting + ref.edge.col = alpha("blue", 0.2) + alt.edge.col = alpha("red", 0.4) + ref.edge.lwd = 0.5 + alt.edge.lwd = 1.0 + edge.col = ifelse(gg$edges$dt$type == "REF", ref.edge.col, alt.edge.col) + edge.lwd = ifelse(gg$edges$dt$type == "REF", ref.edge.lwd, alt.edge.lwd) + gg$edges$mark(col = edge.col, lwd = edge.lwd) + + ## mark zero cn edges + zero.cn.col = alpha("gray", 0) + zero.cn.lwd = 0.5 + zero.cn.edges = which(gg$edges$dt$cn == 0) + gg$edges[zero.cn.edges]$mark(col = zero.cn.col, lwd = zero.cn.lwd) + } + + ## if nonintegral also return the offsets as graph metadata + ## maybe it will be useful + if (nonintegral) { + gg$set(meta = vars[type == "slush", .(chr, offset = x)]) + } + + if (debug) { + return(list(gg = gg, sol = sol)) + } + return(gg) + } res = balance(binstats.gg, lambda = opt$lambda, diff --git a/bin/non_integer_balance.R b/bin/non_integer_balance.R index 64d365f..041e2cd 100644 --- a/bin/non_integer_balance.R +++ b/bin/non_integer_balance.R @@ -41,8 +41,46 @@ library(skitools) library(JaBbA) library(gGnome) + library(zitools) ## devtools::load_all("~/git/gGnome") ## noninteger capacity now added to master branch - devtools::load_all("~/git/zitools") + + #' @name grab.hets + #' @title grab.hets + #' + #' @description + #' + #' returns allele gtrack given sites.txt from het pileup + #' + #' @param agt.fname (character) path to sites.txt + #' @param min.frac (numeric) between 0 and 1, min frequency in normal to count as het site + #' @param max.frac (numeric) between 0 and 1, max frequency in normal to count as het site + #' + #' @return allele gTrack + grab.hets = function(agt.fname = NULL, + min.frac = 0.2, + max.frac = 0.8) + { + if (is.null(agt.fname) || !file.exists(agt.fname)) { + stop("agt.fname does not exist") + } + + ## prepare and filter + agt.dt = fread(agt.fname)[alt.frac.n > min.frac & alt.frac.n < max.frac,] + ## add major and minor + agt.dt[, which.major := ifelse(alt.count.t > ref.count.t, "alt", "ref")] + agt.dt[, major.count := ifelse(which.major == "alt", alt.count.t, ref.count.t)] + agt.dt[, minor.count := ifelse(which.major == "alt", ref.count.t, alt.count.t)] + + ## melt the data frame + agt.melted = rbind(agt.dt[, .(seqnames, start, end, count = major.count, allele = "major")], + agt.dt[, .(seqnames, start, end, count = minor.count, allele = "minor")] + ) + + ## make GRanges + agt.gr = dt2gr(agt.melted[, .(seqnames, start, end, count, allele)]) + + return (agt.gr) + } #' @name grab.hets.from.maf #' @title grab.hets.from.maf diff --git a/conf/igenomes.config b/conf/igenomes.config index be21320..d1dac35 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -48,10 +48,10 @@ params { hapmap_sites = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/hapmap_3.3.b37.vcf.gz" pon_dryclean = "${params.mski_base}/dryclean/pon/hg19/fixed.detergent.rds" blacklist_coverage_jabba = "${params.mski_base}/JaBbA/blacklist_coverage/hg19/maskA_re.rds" - gencode_fusions = "${params.mski_base}/fusions/gencode/hg19/gencode.bed" + gencode_fusions = "${params.mski_base}/fusions/hg19/gencode.v29lift37.annotation.nochr.rds" build_non_integer_balance = "hg19" mask_non_integer_balance = "${params.mski_base}/allelic_cn/non_integer_balance/hg19/mask_with_segdups.rds" - mask_lp_phased_balance = "${params.mski_base}/allelic_cn/lp_phased_balance/maskA_re.rds" + mask_lp_phased_balance = "${params.mski_base}/allelic_cn/lp_phased_balance/lp_phased_balance_maskA_re.rds" } 'GATK.GRCh38' { fasta = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" @@ -95,7 +95,7 @@ params { blacklist_coverage_jabba = "${params.mski_base}/JaBbA/blacklist_coverage/hg38/hg38.coverage.mask.rds" build_non_integer_balance = "hg38" mask_non_integer_balance = "${params.mski_base}/allelic_cn/non_integer_balance/hg38/mask_with_segdups.rds" - mask_lp_phased_balance = "${params.mski_base}/allelic_cn/lp_phased_balance/maskA_re.rds" + mask_lp_phased_balance = "${params.mski_base}/allelic_cn/lp_phased_balance/lp_phased_balance_maskA_re.rds" } 'GRCh37' { diff --git a/conf/modules/alleic_cn.config b/conf/modules/alleic_cn.config index a5e459b..0fe7d71 100644 --- a/conf/modules/alleic_cn.config +++ b/conf/modules/alleic_cn.config @@ -23,24 +23,6 @@ process { ] } - withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:NON_INTEGER_BALANCE_WITH_GRIDSS' { - ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } - publishDir = [ - mode: params.publish_dir_mode, - path: { "${params.outdir}/alleic_cn/non_integer_balance_with_gridss/${meta.id}/" }, - pattern: "*{.rds*,.command.*}" - ] - } - - withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:NON_INTEGER_BALANCE_WITH_SVABA' { - ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } - publishDir = [ - mode: params.publish_dir_mode, - path: { "${params.outdir}/alleic_cn/non_integer_balance_with_svaba/${meta.id}/" }, - pattern: "*{.rds*,.command.*}" - ] - } - withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:LP_PHASED_BALANCE' { ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } publishDir = [ @@ -49,22 +31,4 @@ process { pattern: "*{.rds*,.command.*}" ] } - - withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:LP_PHASED_BALANCE_WITH_GRIDSS' { - ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } - publishDir = [ - mode: params.publish_dir_mode, - path: { "${params.outdir}/alleic_cn/lp_phased_balance_with_gridss/${meta.id}/" }, - pattern: "*{.rds*,.command.*}" - ] - } - - withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:LP_PHASED_BALANCE_WITH_SVABA' { - ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } - publishDir = [ - mode: params.publish_dir_mode, - path: { "${params.outdir}/alleic_cn/lp_phased_balance_with_svaba/${meta.id}/" }, - pattern: "*{.rds*,.command.*}" - ] - } } diff --git a/conf/modules/allelic_cn.config b/conf/modules/allelic_cn.config new file mode 100644 index 0000000..a5e459b --- /dev/null +++ b/conf/modules/allelic_cn.config @@ -0,0 +1,70 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ +// ALLEIC_CN configs + +process { + + withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:NON_INTEGER_BALANCE' { + ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/alleic_cn/non_integer_balance/${meta.id}/" }, + pattern: "*{.rds*,.command.*}" + ] + } + + withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:NON_INTEGER_BALANCE_WITH_GRIDSS' { + ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/alleic_cn/non_integer_balance_with_gridss/${meta.id}/" }, + pattern: "*{.rds*,.command.*}" + ] + } + + withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:NON_INTEGER_BALANCE_WITH_SVABA' { + ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/alleic_cn/non_integer_balance_with_svaba/${meta.id}/" }, + pattern: "*{.rds*,.command.*}" + ] + } + + withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:LP_PHASED_BALANCE' { + ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/alleic_cn/lp_phased_balance/${meta.id}/" }, + pattern: "*{.rds*,.command.*}" + ] + } + + withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:LP_PHASED_BALANCE_WITH_GRIDSS' { + ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/alleic_cn/lp_phased_balance_with_gridss/${meta.id}/" }, + pattern: "*{.rds*,.command.*}" + ] + } + + withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:LP_PHASED_BALANCE_WITH_SVABA' { + ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/alleic_cn/lp_phased_balance_with_svaba/${meta.id}/" }, + pattern: "*{.rds*,.command.*}" + ] + } +} diff --git a/main.nf b/main.nf index 2f13b12..5a41047 100644 --- a/main.nf +++ b/main.nf @@ -60,6 +60,10 @@ params.build_dryclean = WorkflowMain.getGenomeAttribute(params, 'build_dr params.hapmap_sites = WorkflowMain.getGenomeAttribute(params, 'hapmap_sites') params.pon_dryclean = WorkflowMain.getGenomeAttribute(params, 'pon_dryclean') params.blacklist_coverage_jabba = WorkflowMain.getGenomeAttribute(params, 'blacklist_coverage_jabba') +params.gencode_fusions = WorkflowMain.getGenomeAttribute(params, 'gencode_fusions') +params.build_non_integer_balance = WorkflowMain.getGenomeAttribute(params, 'build_non_integer_balance') +params.mask_non_integer_balance = WorkflowMain.getGenomeAttribute(params, 'mask_non_integer_balance') +params.mask_lp_phased_balance = WorkflowMain.getGenomeAttribute(params, 'mask_lp_phased_balance') //params.blacklist_junctions_jabba = WorkflowMain.getGenomeAttribute(params, 'blacklist_junctions_jabba') /* diff --git a/modules/local/allelic_cn/main.nf b/modules/local/allelic_cn/main.nf index d96241d..46e8d21 100644 --- a/modules/local/allelic_cn/main.nf +++ b/modules/local/allelic_cn/main.nf @@ -4,8 +4,8 @@ process NON_INTEGER_BALANCE { label 'process_medium' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker://mskilab/allelic_cn:latest': - 'mskilab/allelic_cn:latest' }" + '/gpfs/commons/groups/imielinski_lab/home/sdider/Projects/nf-jabba/tests/test_runs/work/singularity/jabba_cplex_latest.sif': + 'mskilab/jabba:latest' }" input: tuple val(meta), path(jabba_rds), path(decomposed_cov), path(het_pileups_wgs) @@ -23,6 +23,8 @@ process NON_INTEGER_BALANCE { val(tilim) val(gurobi) path(fasta) // path to decoy fasta + path(fasta_fai) // path to decoy fasta + path(bwa_index) val(pad) output: @@ -37,9 +39,11 @@ process NON_INTEGER_BALANCE { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" def id = "${meta.sample}" + def bwa = bwa_index ? "ln -nfs \$(readlink -f ${bwa_index})/* \$(dirname \$(readlink -f $fasta))/" : "" def VERSION = '0.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. """ + ${bwa} export RSCRIPT_PATH=\$(echo "${baseDir}/bin/non_integer_balance.R") @@ -49,12 +53,12 @@ process NON_INTEGER_BALANCE { --cov $decomposed_cov \\ --field $field \\ --hets $het_pileups_wgs \\ - --hets-thresh $hets_thresh \\ + --hets_thresh $hets_thresh \\ --mask $mask \\ --overwrite $overwrite \\ --lambda $lambda \\ --allin $allin \\ - --fix_thresh $fix_thres \\ + --fix_thresh $fix_thresh \\ --nodebounds $nodebounds \\ --ism $ism \\ --build $build \\ @@ -85,12 +89,12 @@ process NON_INTEGER_BALANCE { process LP_PHASED_BALANCE { - tag "$id" + tag "$meta.id" label 'process_medium' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker://mskilab/allelic_cn:latest': - 'mskilab/allelic_cn:latest' }" + '/gpfs/commons/groups/imielinski_lab/home/sdider/Projects/nf-jabba/tests/test_runs/work/singularity/jabba_cplex_latest.sif': + 'mskilab/jabba:latest' }" input: tuple val(meta), path(hets_gg), path(hets) // output from non_integer_balance, sites.txt from hetpileups @@ -123,7 +127,9 @@ process LP_PHASED_BALANCE { def VERSION = '0.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. """ - Rscript \${baseDir}/bin/lp_phased_balance.R \\ + export RSCRIPT_PATH=\$(echo "${baseDir}/bin/lp_phased_balance.R") + + Rscript \$RSCRIPT_PATH \\ --id $id \\ --jab $hets_gg \\ --hets $hets \\ @@ -146,7 +152,7 @@ process LP_PHASED_BALANCE { cat <<-END_VERSIONS > versions.yml "${task.process}": - non_integer_balance: ${VERSION} + lp_phased_balance: ${VERSION} END_VERSIONS """ } diff --git a/modules/local/ascat/main.nf b/modules/local/ascat/main.nf index d37999f..40db238 100644 --- a/modules/local/ascat/main.nf +++ b/modules/local/ascat/main.nf @@ -8,7 +8,7 @@ process ASCAT_SEG { 'mskilab/ascat_seg:latest' }" input: - tuple val(meta), path(hets), path(cbs_cov) // channel: [mandatory] [ meta, hets ] + tuple val(meta), path(hets), path(cbs_cov) // channel: [mandatory] [ meta, hets, cbs_cov ] val(field) // channel: [mandatory] "foreground" for dryclean/ "ratio" val(hets_thresh) // channel: cutoff for hetpileups; default=0.2 val(penalty) // channel: penalty for ASCAT; default=70 diff --git a/modules/local/cbs/main.nf b/modules/local/cbs/main.nf index d1d36a0..7dd788d 100644 --- a/modules/local/cbs/main.nf +++ b/modules/local/cbs/main.nf @@ -29,7 +29,6 @@ process CBS { def VERSION = '0.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. """ - #RSCRIPT_PATH=\$(if [[ ${workflow.containerEngine} == "singularity" && !task.ext.singularity_pull_docker_container ]]; then echo "/cbsFH.R"; else echo "\${baseDir}/bin/cbsFH.R"; fi) export RSCRIPT_PATH=\$(echo "${baseDir}/bin/cbsFH.R") Rscript \$RSCRIPT_PATH \\ -t ${tumor_dryclean_cov} \ diff --git a/modules/local/dryclean/main.nf b/modules/local/dryclean/main.nf index 3576265..f643540 100644 --- a/modules/local/dryclean/main.nf +++ b/modules/local/dryclean/main.nf @@ -3,14 +3,14 @@ process DRYCLEAN { label 'process_medium' container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'docker://mskilab/dryclean:0.0.2': - 'mskilab/dryclean:0.0.2' }" + 'docker://mskilab/dryclean:0.0.4': + 'mskilab/dryclean:0.0.4' }" input: tuple val(meta), path(input) path(pon) - val(centered) + val(center) val(cbs) val(cnsignif) val(wholeGenome) @@ -18,7 +18,6 @@ process DRYCLEAN { val(blacklist_path) val(germline_filter) val(germline_file) - val(human) val(field) val(build) @@ -33,7 +32,7 @@ process DRYCLEAN { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def VERSION = '0.0.2' + def VERSION = '0.0.4' """ #!/bin/bash set -o allexport @@ -70,7 +69,7 @@ process DRYCLEAN { CMD="Rscript \$drycln \\ --input ${input} \\ --pon ${pon} \\ - --center ${centered} \\ + --center ${center} \\ --cbs ${cbs} \\ --cnsignif ${cnsignif} \\ --cores ${task.cpus} \\ @@ -79,7 +78,6 @@ process DRYCLEAN { --blacklist_path ${blacklist_path} \\ --germline.filter ${germline_filter} \\ --germline.file ${germline_file} \\ - --human ${human} \\ --field ${field} \\ --build ${build} \\ " @@ -100,8 +98,6 @@ process DRYCLEAN { """ stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" """ touch drycleaned.cov.rds """ diff --git a/modules/local/fusions/main.nf b/modules/local/fusions/main.nf index c4d3700..9879e86 100644 --- a/modules/local/fusions/main.nf +++ b/modules/local/fusions/main.nf @@ -27,12 +27,13 @@ process FUSIONS { """ - export RSCRIPT_PATH=\$(echo "${baseDir}/bin/fusions.R") + export RSCRIPT_PATH=\$(echo "${baseDir}/bin/Fusions.R") Rscript \$RSCRIPT_PATH \\ --id $id \\ --gGraph $gGraph \\ --gencode $gencode \\ + --cores ${task.cpus} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/gridss/gridss/main.nf b/modules/local/gridss/gridss/main.nf index 6856c7a..feb437e 100644 --- a/modules/local/gridss/gridss/main.nf +++ b/modules/local/gridss/gridss/main.nf @@ -10,10 +10,10 @@ process GRIDSS_GRIDSS { input: - tuple val(meta), path(normalbam, stageAs: "normal.bam"), path(normalbai, stageAs: "normal.bam.bai"), path(tumorbam, stageAs: "tumor.bam"), path(tumorbai, stageAs: "tumor.bam.bai") // required: [meta, normal_cram, normal_crai, tumor_cram, tumor_crai] - path(fasta) // required: reference fasta + tuple val(meta), path(normalbam, stageAs: "normal.bam"), path(normalbai, stageAs: "normal.bam.bai"), path(tumorbam, stageAs: "tumor.bam"), path(tumorbai, stageAs: "tumor.bam.bai") + path(fasta) path(fasta_fai) - path(bwa_index) // required: bwa index folder + path(bwa_index) path(blacklist_gridss) // optional: gridss blacklist bed file based on genome diff --git a/nextflow.config b/nextflow.config index 1253643..b5ccb28 100644 --- a/nextflow.config +++ b/nextflow.config @@ -60,7 +60,7 @@ params { // fragCounter options midpoint_frag = "TRUE" // If TRUE only count midpoint if FALSE then count bin footprint of every fragment interval: Default=TRUE - windowsize_frag = 200 // Window / bin size : Default=200 (but dryclean uses 1000 binsize) + windowsize_frag = 1000 // Window / bin size : Default=200 (but dryclean uses 1000 binsize) minmapq_frag = 60 // Minimal map quality : Default = 1 paired_frag = "TRUE" // Is the dataset paired : Default = TRUE exome_frag = "FALSE" // Use exons as bins instead of fixed window : Default = FALSE @@ -74,7 +74,6 @@ params { blacklist_path_dryclean = "NA" germline_filter_dryclean = "FALSE" germline_file_dryclean = "NA" - human_dryclean = "TRUE" field_dryclean = "reads" //build_dryclean = "hg19" // This should go inside igenomes.config @@ -451,7 +450,14 @@ includeConfig 'conf/modules/hetpileups.config' // JaBbA configurations includeConfig 'conf/modules/jabba.config' +// Events configurations +includeConfig 'conf/modules/events.config' +// Fusions configurations +includeConfig 'conf/modules/fusions.config' + +// Allelic CN configurations +includeConfig 'conf/modules/allelic_cn.config' // Function to ensure that resource requirements don't go beyond // a maximum limit diff --git a/nextflow_schema.json b/nextflow_schema.json index 2eb9772..e4ac667 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -422,14 +422,6 @@ "help_text": "Path to file annotated with germline calls, if germline.filter == TRUE", "default": "NA" }, - "human_dryclean": { - "type": "string", - "fa_icon": "fas fa-forward", - "description": "Specify if the samples under consideration are human", - "hidden": true, - "help_text": "Specify if the samples under consideration are human", - "default": "TRUE" - }, "field_dryclean": { "type": "string", "fa_icon": "fas fa-forward", diff --git a/subworkflows/local/allelic_cn/main.nf b/subworkflows/local/allelic_cn/main.nf index fbdd051..aed8299 100644 --- a/subworkflows/local/allelic_cn/main.nf +++ b/subworkflows/local/allelic_cn/main.nf @@ -18,6 +18,8 @@ workflow COV_GGRAPH_NON_INTEGER_BALANCE { tilim_non_integer_balance gurobi_non_integer_balance fasta_non_integer_balance // path to decoy fasta + fasta_fai_non_integer_balance // path to decoy fasta.fai + bwa_non_integer_balance pad_non_integer_balance main: @@ -41,6 +43,8 @@ workflow COV_GGRAPH_NON_INTEGER_BALANCE { tilim_non_integer_balance, gurobi_non_integer_balance, fasta_non_integer_balance, + fasta_fai_non_integer_balance, + bwa_non_integer_balance, pad_non_integer_balance ) diff --git a/subworkflows/local/bam_fragCounter/main.nf b/subworkflows/local/bam_fragCounter/main.nf index c40dc5a..7ddc5ed 100644 --- a/subworkflows/local/bam_fragCounter/main.nf +++ b/subworkflows/local/bam_fragCounter/main.nf @@ -33,15 +33,16 @@ workflow BAM_FRAGCOUNTER { versions = FRAGCOUNTER.out.versions corrected_bw = FRAGCOUNTER.out.corrected_bw - REBIN_RAW_FRAGCOUNTER(fragcounter_raw_cov, "reads", 1000) + // REBIN_RAW_FRAGCOUNTER(fragcounter_cov, "reads", 1000) - rebinned_raw_cov = REBIN_RAW_FRAGCOUNTER.out.raw_fragcounter_cov_1kb + // rebinned_raw_cov = REBIN_RAW_FRAGCOUNTER.out.raw_fragcounter_cov_1kb + // rebinned_raw_cov = fragcounter_cov // emit: fragcounter_raw_cov fragcounter_cov - rebinned_raw_cov + // rebinned_raw_cov corrected_bw versions diff --git a/subworkflows/local/cov_dryclean/main.nf b/subworkflows/local/cov_dryclean/main.nf index 3b0c95b..af91097 100644 --- a/subworkflows/local/cov_dryclean/main.nf +++ b/subworkflows/local/cov_dryclean/main.nf @@ -9,7 +9,7 @@ workflow COV_DRYCLEAN { take: input_dryclean // channel: [mandatory] [ meta, cov(.rds file) ] pon_dryclean - centered_dryclean + center_dryclean cbs_dryclean cnsignif_dryclean wholeGenome_dryclean @@ -17,7 +17,6 @@ workflow COV_DRYCLEAN { blacklist_path_dryclean germline_filter_dryclean germline_file_dryclean - human_dryclean field_dryclean build_dryclean @@ -26,10 +25,20 @@ workflow COV_DRYCLEAN { dryclean_cov = Channel.empty() //dryclean_obj = Channel.empty() - DRYCLEAN(input_dryclean, pon_dryclean, centered_dryclean, cbs_dryclean, - cnsignif_dryclean, wholeGenome_dryclean, blacklist_dryclean, - blacklist_path_dryclean, germline_filter_dryclean, germline_file_dryclean, - human_dryclean, field_dryclean, build_dryclean) + DRYCLEAN( + input_dryclean, + pon_dryclean, + center_dryclean, + cbs_dryclean, + cnsignif_dryclean, + wholeGenome_dryclean, + blacklist_dryclean, + blacklist_path_dryclean, + germline_filter_dryclean, + germline_file_dryclean, + field_dryclean, + build_dryclean + ) dryclean_cov = DRYCLEAN.out.decomposed_cov //dryclean_obj = DRYCLEAN.out.dryclean_object diff --git a/tests/modules/local/dryclean/main.nf.test b/tests/modules/local/dryclean/main.nf.test index 0bc76b7..92f4905 100644 --- a/tests/modules/local/dryclean/main.nf.test +++ b/tests/modules/local/dryclean/main.nf.test @@ -27,9 +27,8 @@ nextflow_process { input[7] = params.blacklist_path_dryclean input[8] = params.germline_filter_dryclean input[9] = params.germline_file_dryclean - input[10] = params.human_dryclean - input[11] = params.field_dryclean - input[12] = params.genomes['GATK.GRCh37'].build_dryclean + input[10] = params.field_dryclean + input[11] = params.genomes['GATK.GRCh37'].build_dryclean """ } } diff --git a/tests/nextflow.config b/tests/nextflow.config index 0cd3aa9..6c7e174 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -65,13 +65,13 @@ params { // fragCounter options midpoint_frag = "TRUE" // If TRUE only count midpoint if FALSE then count bin footprint of every fragment interval: Default=TRUE - windowsize_frag = 200 // Window / bin size : Default=200 (but dryclean uses 1000 binsize) + windowsize_frag = 1000 // Window / bin size : Default=200 (but dryclean uses 1000 binsize) minmapq_frag = 60 // Minimal map quality : Default = 1 paired_frag = "TRUE" // Is the dataset paired : Default = TRUE exome_frag = "FALSE" // Use exons as bins instead of fixed window : Default = FALSE // Dryclean options - centered_dryclean = "TRUE" + center_dryclean = "TRUE" cbs_dryclean = "FALSE" cnsignif_dryclean = 0.00001 wholeGenome_dryclean = "FALSE" @@ -79,8 +79,7 @@ params { blacklist_path_dryclean = "NA" germline_filter_dryclean = "FALSE" germline_file_dryclean = "NA" - human_dryclean = "TRUE" - field_dryclean = "count" + field_dryclean = "reads" // ASCAT options field_ascat = "foreground" @@ -241,8 +240,8 @@ params { genomes { 'GATK.GRCh37' { - fasta = "${params.mski_base}/test_data/human_g1k_v37_decoy.small.fasta" - fasta_fai = "${params.mski_base}/test_data/human_g1k_v37_decoy.small.fasta.fai" + fasta = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta" + fasta_fai = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta.fai" chr_dir = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/Chromosomes" dict = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.dict" bwa = "${params.mski_base}/test_data/BWAIndex/" @@ -270,9 +269,9 @@ params { indel_mask = "${params.mski_base}/SVABA/hg19/snowman_blacklist.bed" germ_sv_db = "${params.mski_base}/SVABA/hg19/snowman_germline_mini_160413.bed" simple_seq_db = "${params.mski_base}/SVABA/hg19/repeat_masker_hg19_Simple.bed" - blacklist_gridss = "${params.mski_base}/test_data/human_g1k_v37_decoy.small.fasta.bed" + blacklist_gridss = "${params.mski_base}/GRIDSS/blacklist/hg19/human_g1k_v37_decoy.fasta.bed" pon_gridss = "${params.mski_base}/GRIDSS/pon/hg19/" - gcmapdir_frag = "${params.mski_base}/test_data/gcMAP21/" + gcmapdir_frag = "${params.mski_base}/fragcounter/gcmapdir/hg19/" build_dryclean = 'hg19' hapmap_sites = "${params.mski_base}/test_data/hapmap_3.3.b37.vcf.gz" pon_dryclean = "${params.mski_base}/test_data/chr21_pon.rds" diff --git a/workflows/nfjabba.nf b/workflows/nfjabba.nf index ac0b0bc..f908ac0 100644 --- a/workflows/nfjabba.nf +++ b/workflows/nfjabba.nf @@ -50,7 +50,7 @@ def checkPathParamList = [ params.pon_tbi, params.gcmapdir_frag, params.pon_dryclean, - params.blacklist_coverage_jabba + params.blacklist_coverage_jabba, ] def toolParamMap = [ @@ -318,13 +318,12 @@ paired_frag = params.paired_frag ?: Channel.empty() exome_frag = params.exome_frag ?: Channel.empty() // For fragCounter // Dryclean -centered_dryclean = params.centered_dryclean ?: Channel.empty() +center_dryclean = params.centered_dryclean ?: Channel.empty() cbs_dryclean = params.cbs_dryclean ?: Channel.empty() cnsignif_dryclean = params.cnsignif_dryclean ?: Channel.empty() wholeGenome_dryclean = params.wholeGenome_dryclean ?: Channel.empty() blacklist_dryclean = params.blacklist_dryclean ?: Channel.empty() germline_filter_dryclean = params.germline_filter_dryclean ?: Channel.empty() -human_dryclean = params.human_dryclean ?: Channel.empty() field_dryclean = params.field_dryclean ?: Channel.empty() build_dryclean = params.build_dryclean ?: Channel.empty() @@ -398,7 +397,7 @@ ism_lp_phased_balance = params.ism_lp_phased_balance ?: Channel.empty() epgap_lp_phased_balance = params.epgap_lp_phased_balance ?: Channel.empty() hets_thresh_lp_phased_balance = params.hets_thresh_lp_phased_balance ?: Channel.empty() min_bins_lp_phased_balance = params.min_bins_lp_phased_balance ?: Channel.empty() -min_width_lp_phased_balance = params.min_width_lp_phased_balance ?: Channel.empty() +min_width_lp_phased_balance = params.min_width_lp_phased_balance || params.min_width_lp_phased_balance == 0 ? params.min_width_lp_phased_balance : Channel.empty() trelim_lp_phased_balance = params.trelim_lp_phased_balance ?: Channel.empty() reward_lp_phased_balance = params.reward_lp_phased_balance ?: Channel.empty() nodefileind_lp_phased_balance = params.nodefileind_lp_phased_balance ?: Channel.empty() @@ -674,7 +673,7 @@ workflow NFJABBA { boolean runJabba = false boolean runEvents = false boolean runFusions = false - boolean runAlleicCN = false + boolean runAllelicCN = false // Set flags based on params.step using a cascading approach // Fall through to the next case if the previous case is true @@ -704,7 +703,7 @@ workflow NFJABBA { case 'fusions': runFusions = true case 'allelic_cn': - runAlleicCN = true + runAllelicCN = true break default: error "Invalid step: ${params.step}" @@ -1163,10 +1162,10 @@ workflow NFJABBA { if (tools_used.contains('fragcounter')) { NORMAL_FRAGCOUNTER(bam_fragcounter_status.normal, midpoint_frag, windowsize_frag, gcmapdir_frag, minmapq_frag, paired_frag, exome_frag) - normal_frag_cov = Channel.empty().mix(NORMAL_FRAGCOUNTER.out.rebinned_raw_cov) + normal_frag_cov = Channel.empty().mix(NORMAL_FRAGCOUNTER.out.fragcounter_cov) TUMOR_FRAGCOUNTER(bam_fragcounter_status.tumor, midpoint_frag, windowsize_frag, gcmapdir_frag, minmapq_frag, paired_frag, exome_frag) - tumor_frag_cov = Channel.empty().mix(TUMOR_FRAGCOUNTER.out.rebinned_raw_cov) + tumor_frag_cov = Channel.empty().mix(TUMOR_FRAGCOUNTER.out.fragcounter_cov) // Only need one versions because its just one program (fragcounter) versions = versions.mix(NORMAL_FRAGCOUNTER.out.versions) @@ -1243,7 +1242,7 @@ workflow NFJABBA { TUMOR_DRYCLEAN( tumor_frag_cov, pon_dryclean, - centered_dryclean, + center_dryclean, cbs_dryclean, cnsignif_dryclean, wholeGenome_dryclean, @@ -1251,7 +1250,6 @@ workflow NFJABBA { blacklist_path_dryclean, germline_filter_dryclean, germline_file_dryclean, - human_dryclean, field_dryclean, build_dryclean ) @@ -1261,7 +1259,7 @@ workflow NFJABBA { NORMAL_DRYCLEAN( normal_frag_cov, pon_dryclean, - centered_dryclean, + center_dryclean, cbs_dryclean, cnsignif_dryclean, wholeGenome_dryclean, @@ -1269,7 +1267,6 @@ workflow NFJABBA { blacklist_path_dryclean, germline_filter_dryclean, germline_file_dryclean, - human_dryclean, field_dryclean, build_dryclean ) @@ -1326,7 +1323,7 @@ workflow NFJABBA { meta.patient = cov[0] meta.sex = cov[1].sex - [ meta, cov[2], hets[2] ] + [ meta, hets[2], cov[2] ] } } @@ -1349,7 +1346,6 @@ workflow NFJABBA { if (runJabba) { if (tools_used.contains('jabba')) { - if (params.step == 'jabba') { // put all the inputs into a map for easier retrieval jabba_input_map = input_sample @@ -1433,7 +1429,9 @@ workflow NFJABBA { } else { - tumor_dryclean_cov_for_joining = tumor_dryclean_cov.map { meta, tumor_cov -> [meta.patient, meta, tumor_cov] } + meta_for_joining = tumor_dryclean_cov.map{ meta, tumor_cov -> [meta.patient, meta] } // can use any of the inputs to get the meta data + + tumor_dryclean_cov_for_joining = tumor_dryclean_cov.map { meta, tumor_cov -> [meta.patient, tumor_cov] } het_pileups_for_joining = sites_from_het_pileups_wgs.map { meta, hets -> [meta.patient, hets] } @@ -1444,7 +1442,8 @@ workflow NFJABBA { // join all previous outputs to be used as input for jabba // excluding svs since they can come from either svaba or gridss - jabba_inputs = tumor_dryclean_cov_for_joining + jabba_inputs = meta_for_joining + .join(tumor_dryclean_cov_for_joining) .join(het_pileups_for_joining) .join(ploidy_for_joining) .join(cbs_seg_rds_for_joining) @@ -1624,180 +1623,86 @@ workflow NFJABBA { } } - if (runAlleicCN) { + if (runAllelicCN) { if (tools_used.contains('allelic_cn')) { if (params.step == 'allelic_cn') { non_integer_balance_inputs = input_sample.map{ meta, cov, hets, ggraph -> [ meta, ggraph, cov, hets ] } - - NON_INTEGER_BALANCE( - non_integer_balance_inputs, - field_non_integer_balance, - hets_thresh_non_integer_balance, - mask_non_integer_balance, - overwrite_non_integer_balance, - lambda_non_integer_balance, - allin_non_integer_balance, - fix_thresh_non_integer_balance, - nodebounds_non_integer_balance, - ism_non_integer_balance, - build_non_integer_balance, - epgap_non_integer_balance, - tilim_non_integer_balance, - gurobi_non_integer_balance, - fasta, - pad_non_integer_balance - ) - versions = Channel.empty().mix(NON_INTEGER_BALANCE.out.versions) - - non_integer_balance_balanced_gg = Channel.empty().mix(NON_INTEGER_BALANCE.out.non_integer_balance_balanced_gg) - non_integer_balance_hets_gg = Channel.empty().mix(NON_INTEGER_BALANCE.out.non_integer_balance_hets_gg) - - non_integer_balance_balanced_gg_for_joining = non_integer_balance_balanced_gg.map{ meta, balanced -> [ meta.patient, meta, balanced ] } - hets_for_joining = input_sample.map{ meta, cov, hets, ggraph -> [ meta.patient, hets ] } - lp_phased_balance_inputs = non_integer_balance_hets_gg_for_joining.join(hets_for_joining) - .map{ patient, meta, balanced, hets -> [ meta, balanced, hets ] - } - - LP_PHASED_BALANCE( - lp_phased_balance_inputs, - lambda_lp_phased_balance, - cnloh_lp_phased_balance, - major_lp_phased_balance, - allin_lp_phased_balance, - marginal_lp_phased_balance, - from_maf_lp_phased_balance, - mask_lp_phased_balance, - ism_lp_phased_balance, - epgap_lp_phased_balance, - hets_thresh_lp_phased_balance, - min_bins_lp_phased_balance, - min_width_lp_phased_balance, - trelim_lp_phased_balance, - reward_lp_phased_balance, - nodefileind_lp_phased_balance, - tilim_lp_phased_balance - ) - - lp_phased_balance_balanced_gg = Channel.empty().mix(LP_PHASED_BALANCE.out.lp_phased_balance_balanced_gg) - lp_phased_balance_binstats_gg = Channel.empty().mix(LP_PHASED_BALANCE.out.lp_phased_balance_binstats_gg) - lp_phased_balance_unphased_allelic_gg = Channel.empty().mix(LP_PHASED_BALANCE.out.lp_phased_balance_unphased_allelic_gg) + het_pileups_for_joining = input_sample.map{ meta, cov, hets, ggraph -> [ meta.patient, hets ] } } else { if (tools_used.contains('gridss')) { - jabba_rds_with_gridss_for_joining = jabba_rds_with_gridss.map{ meta, rds -> [ meta.patient, meta, rds ] } - non_integer_balance_w_gridss_inputs = jabba_rds_with_gridss_for_joining + jabba_rds_with_gridss_for_joining = jabba_rds_with_gridss.map{ meta, rds -> [ meta.patient, rds ] } + non_integer_balance_inputs = meta_for_joining + .join(jabba_rds_with_gridss_for_joining) .join(tumor_dryclean_cov_for_joining) .join(het_pileups_for_joining) .map{ patient, meta, rds, cov, hets -> [ meta, rds, cov, hets ] } - NON_INTEGER_BALANCE_WITH_GRIDSS( - non_integer_balance_w_gridss_inputs, - field_non_integer_balance, - hets_thresh_non_integer_balance, - mask_non_integer_balance, - overwrite_non_integer_balance, - lambda_non_integer_balance, - allin_non_integer_balance, - fix_thresh_non_integer_balance, - nodebounds_non_integer_balance, - ism_non_integer_balance, - build_non_integer_balance, - epgap_non_integer_balance, - tilim_non_integer_balance, - gurobi_non_integer_balance, - fasta, - pad_non_integer_balance - ) - versions_w_gridss = Channel.empty().mix(NON_INTEGER_BALANCE_WITH_GRIDSS.out.versions) - - non_integer_balance_w_gridss_balanced_gg = Channel.empty().mix(NON_INTEGER_BALANCE_WITH_GRIDSS.out.non_integer_balance_balanced_gg) - non_integer_balance_w_gridss_hets_gg = Channel.empty().mix(NON_INTEGER_BALANCE_WITH_GRIDSS.out.non_integer_balance_hets_gg) - - non_integer_balance_w_gridss_balanced_gg_for_joining = non_integer_balance_w_gridss_balanced_gg.map{ meta, balanced -> [ meta.patient, meta, balanced ] } - lp_phased_balance_w_gridss_inputs = non_integer_balance_w_gridss_balanced_gg_for_joining.join(het_pileups_for_joining) - .map{ patient, meta, balanced, hets -> [ meta, balanced, hets ] } - - LP_PHASED_BALANCE_WITH_GRIDSS( - lp_phased_balance_w_gridss_inputs, - lambda_lp_phased_balance, - cnloh_lp_phased_balance, - major_lp_phased_balance, - allin_lp_phased_balance, - marginal_lp_phased_balance, - from_maf_lp_phased_balance, - mask_lp_phased_balance, - ism_lp_phased_balance, - epgap_lp_phased_balance, - hets_thresh_lp_phased_balance, - min_bins_lp_phased_balance, - min_width_lp_phased_balance, - trelim_lp_phased_balance, - reward_lp_phased_balance, - nodefileind_lp_phased_balance, - tilim_lp_phased_balance - ) - - lp_phased_balance_w_gridss_balanced_gg = Channel.empty().mix(LP_PHASED_BALANCE_WITH_GRIDSS.out.lp_phased_balance_balanced_gg) - lp_phased_balance_w_gridss_binstats_gg = Channel.empty().mix(LP_PHASED_BALANCE_WITH_GRIDSS.out.lp_phased_balance_binstats_gg) - lp_phased_balance_w_gridss_unphased_allelic_gg = Channel.empty().mix(LP_PHASED_BALANCE_WITH_GRIDSS.out.lp_phased_balance_unphased_allelic_gg) } if (tools_used.contains('svaba')) { - jabba_rds_with_svaba_for_joining = jabba_rds_with_svaba.map{ meta, rds -> [ meta.patient, meta, rds ] } - non_integer_balance_w_svaba_inputs = jabba_rds_with_svaba_for_joining + jabba_rds_with_svaba_for_joining = jabba_rds_with_svaba.map{ meta, rds -> [ meta.patient, rds ] } + non_integer_balance_inputs = meta_for_joining + .join(jabba_rds_with_svaba_for_joining) .join(tumor_dryclean_cov_for_joining) .join(het_pileups_for_joining) .map{ patient, meta, rds, cov, hets -> [ meta, rds, cov, hets ] } - NON_INTEGER_BALANCE_WITH_SVABA( - non_integer_balance_w_svaba_inputs, - field_non_integer_balance, - hets_thresh_non_integer_balance, - mask_non_integer_balance, - overwrite_non_integer_balance, - lambda_non_integer_balance, - allin_non_integer_balance, - fix_thresh_non_integer_balance, - nodebounds_non_integer_balance, - ism_non_integer_balance, - build_non_integer_balance, - epgap_non_integer_balance, - tilim_non_integer_balance, - gurobi_non_integer_balance, - fasta, - pad_non_integer_balance - ) - versions_w_svaba = Channel.empty().mix(NON_INTEGER_BALANCE_WITH_SVABA.out.versions) - - non_integer_balance_w_svaba_balanced_gg = Channel.empty().mix(NON_INTEGER_BALANCE_WITH_SVABA.out.non_integer_balance_balanced_gg) - non_integer_balance_w_svaba_hets_gg = Channel.empty().mix(NON_INTEGER_BALANCE_WITH_SVABA.out.non_integer_balance_hets_gg) - - non_integer_balance_w_svaba_balanced_gg_for_joining = non_integer_balance_w_svaba_balanced_gg.map{ meta, balanced -> [ meta.patient, meta, balanced ] } - lp_phased_balance_w_svaba_inputs = non_integer_balance_w_svaba_balanced_gg_for_joining.join(het_pileups_for_joining) - .map{ patient, meta, balanced, hets -> [ meta, balanced, hets ] } - - LP_PHASED_BALANCE_WITH_SVABA( - lp_phased_balance_w_svaba_inputs, - lambda_lp_phased_balance, - cnloh_lp_phased_balance, - major_lp_phased_balance, - allin_lp_phased_balance, - marginal_lp_phased_balance, - from_maf_lp_phased_balance, - mask_lp_phased_balance, - ism_lp_phased_balance, - epgap_lp_phased_balance, - hets_thresh_lp_phased_balance, - min_bins_lp_phased_balance, - min_width_lp_phased_balance, - trelim_lp_phased_balance, - reward_lp_phased_balance, - nodefileind_lp_phased_balance, - tilim_lp_phased_balance - ) - - lp_phased_balance_balanced_gg = Channel.empty().mix(LP_PHASED_BALANCE_WITH_SVABA.out.lp_phased_balance_balanced_gg) - lp_phased_balance_binstats_gg = Channel.empty().mix(LP_PHASED_BALANCE_WITH_SVABA.out.lp_phased_balance_binstats_gg) - lp_phased_balance_unphased_allelic_gg = Channel.empty().mix(LP_PHASED_BALANCE_WITH_SVABA.out.lp_phased_balance_unphased_allelic_gg) } } + + NON_INTEGER_BALANCE( + non_integer_balance_inputs, + field_non_integer_balance, + hets_thresh_non_integer_balance, + mask_non_integer_balance, + overwrite_non_integer_balance, + lambda_non_integer_balance, + allin_non_integer_balance, + fix_thresh_non_integer_balance, + nodebounds_non_integer_balance, + ism_non_integer_balance, + build_non_integer_balance, + epgap_non_integer_balance, + tilim_non_integer_balance, + gurobi_non_integer_balance, + fasta, + fasta_fai, + bwa, + pad_non_integer_balance + ) + versions = Channel.empty().mix(NON_INTEGER_BALANCE.out.versions) + + non_integer_balance_balanced_gg = Channel.empty().mix(NON_INTEGER_BALANCE.out.non_integer_balance_balanced_gg) + non_integer_balance_hets_gg = Channel.empty().mix(NON_INTEGER_BALANCE.out.non_integer_balance_hets_gg) + + non_integer_balance_balanced_gg_for_joining = non_integer_balance_balanced_gg.map{ meta, balanced -> [ meta.patient, balanced ] } + + lp_phased_balance_inputs = meta_for_joining + .join(non_integer_balance_balanced_gg_for_joining) + .join(het_pileups_for_joining) + .map{ patient, meta, balanced_gg, hets -> [ meta, balanced_gg, hets ] } + + LP_PHASED_BALANCE( + lp_phased_balance_inputs, + lambda_lp_phased_balance, + cnloh_lp_phased_balance, + major_lp_phased_balance, + allin_lp_phased_balance, + marginal_lp_phased_balance, + from_maf_lp_phased_balance, + mask_lp_phased_balance, + ism_lp_phased_balance, + epgap_lp_phased_balance, + hets_thresh_lp_phased_balance, + min_bins_lp_phased_balance, + min_width_lp_phased_balance, + trelim_lp_phased_balance, + reward_lp_phased_balance, + nodefileind_lp_phased_balance, + tilim_lp_phased_balance + ) + + lp_phased_balance_balanced_gg = Channel.empty().mix(LP_PHASED_BALANCE.out.lp_phased_balance_balanced_gg) + lp_phased_balance_binstats_gg = Channel.empty().mix(LP_PHASED_BALANCE.out.lp_phased_balance_binstats_gg) + lp_phased_balance_unphased_allelic_gg = Channel.empty().mix(LP_PHASED_BALANCE.out.lp_phased_balance_unphased_allelic_gg) } } } From c66190707e47b869cd753bb673057d319fb52a89 Mon Sep 17 00:00:00 2001 From: shihabdider Date: Wed, 8 May 2024 16:25:22 -0400 Subject: [PATCH 2/3] refactor: put allelic_cn before events calling --- bin/ascat_seg.R | 20 +-- bin/lp_phased_balance.R | 85 ++++++----- bin/non_integer_balance.R | 6 +- tests/nextflow.config | 2 +- workflows/nfjabba.nf | 295 ++++++++++++-------------------------- 5 files changed, 152 insertions(+), 256 deletions(-) diff --git a/bin/ascat_seg.R b/bin/ascat_seg.R index 835a6df..dfe0024 100644 --- a/bin/ascat_seg.R +++ b/bin/ascat_seg.R @@ -1,6 +1,6 @@ { library(optparse) - + ## DO NOT FAIL SILENTLY options(error = function() {traceback(2); quit("no", 1)}) @@ -35,7 +35,7 @@ opt = parse_args(parseobj) print(opt) - + print(.libPaths()) options(error=function()traceback(2)) @@ -59,7 +59,7 @@ #' #' @param gg (gGraph) diploid gGraph (expect cn.low and cn.high as node metadata) #' @param verbose (logical) default FALSE - #' + #' #' @return gGraph with field allele and cn diploid2haploid = function(gg, verbose = FALSE) { og.nodes.gr = gg$nodes$gr[, c("cn.low", "cn.high", "var.low", "var.high", "cn", "node.id", "nhets")] @@ -286,7 +286,7 @@ ## log.p = T) + ## pnbinom(y, mu = centers[tot.cn - j + 1], ## size = centers[tot.cn - j + 1], - ## log.p = T))) + ## log.p = T))) ll = sapply(0:(floor(tot.cn/2)), function(j) sum(ppois(x,centers[j+1], log.p = T) + ppois(y,centers[tot.cn-j+1],log.p = T))) ll = ll - min(ll) @@ -300,7 +300,7 @@ ## log.p = T) + ## pnbinom(y, mu = centers[tot.cn - j], ## size = centers[tot.cn - j] / 2, - ## log.p = T))) + ## log.p = T))) ll = sapply(0:(floor((tot.cn - 1)/2)), function(j) sum(ppois(x,centers[j+1], log.p = T) + ppois(y,centers[tot.cn-j],log.p = T))) ll = ll - min(ll) @@ -315,7 +315,7 @@ ## log.p = T) + ## pnbinom(y, mu = centers[tot.cn - j + 2], ## size = centers[tot.cn - j + 2], - ## log.p = T))) + ## log.p = T))) ll = sapply(0:(floor((tot.cn + 1)/2)), function(j) sum(ppois(x,centers[j+1], log.p = T) + ppois(y,centers[tot.cn-j+2],log.p = T))) ll = ll - min(ll) @@ -363,7 +363,7 @@ ## ## ########### ## ## phasing - ## ## ########### + ## ## ########### ## ## iterate through all reference junctions and apply (wishful thinking) heuristic ## ## @@ -561,7 +561,7 @@ #' @param agt.fname (character) path to sites.txt #' @param min.frac (numeric) between 0 and 1, min frequency in normal to count as het site #' @param max.frac (numeric) between 0 and 1, max frequency in normal to count as het site - #' + #' #' @return allele gTrack grab.hets = function(agt.fname = NULL, min.frac = 0.2, @@ -605,7 +605,7 @@ if (!file.exists(agt.fname)) { stop("invalid file") } - + gr = readRDS(agt.fname) if (!inherits(gr, "GRanges")) { @@ -748,6 +748,8 @@ median(ratio, na.rm = T)] > 0.8, "XX", "XY") + # handle single chr case as XY + gender = ifelse(!is.na(gender), gender, "XY") message("The gender of this sample: ", gender) message("Starting ASCAT!!") diff --git a/bin/lp_phased_balance.R b/bin/lp_phased_balance.R index b3323e4..145635a 100644 --- a/bin/lp_phased_balance.R +++ b/bin/lp_phased_balance.R @@ -705,45 +705,51 @@ dt2gr(hets.dt[, .(seqnames, start, end, strand = "*")]), return.type = "data.table") - nodes.ov.hets[, count := .N, by = query.id] - - ## we should only segment relatively wide nodes with low copy number - nodes.ov.hets = nodes.ov.hets[count >= 100,] - - nodes.to.segment = nodes.ov.hets[, unique(query.id)] - - new.sl = seqlengths(gg) - - segs = lapply(nodes.to.segment, - function(qid) { - message("Starting segmentation for : ", qid) - hets.subset.dt = hets.dt[nodes.ov.hets[query.id == qid, subject.id],] - cna = CNA(hets.subset.dt[, BAF], - hets.subset.dt[, as.character(seqnames)], - hets.subset.dt[, start], - data.type = "logratio") - seg = segment(smooth.CNA(cna), - alpha = 1e-5, - verbose = TRUE) - utils::capture.output({seg_dt = print(seg); setDT(seg_dt)}, - type = "output", - file = "/dev/null") - out = seg2gr(seg_dt[!(is.na(seg.mean) | is.na(loc.start) | is.na(loc.end))], new.sl) - out = gr.fix(out, new.sl, drop = T) - ## get the number of hets per segment - values(out)[, "nhets"] = out %N% hets.gr - ## make sure the segment is on the order of high kbp - out = out %Q% (nhets > 50) - message("Number of segments: ", length(out)) - names(out) = NULL - if (length(out) > 1) { - return(gr.start(out[2:length(out)])) - } - return(GRanges()) - message("Finished!") - }) - - segs = do.call(grbind, segs) + # handle case where nodes.ov.hets is empty + if (nrow(nodes.ov.hets) == 0) { + message("No nodes overlap with hets") + segs = NULL + } else { + nodes.ov.hets[, count := .N, by = query.id] + + ## we should only segment relatively wide nodes with low copy number + nodes.ov.hets = nodes.ov.hets[count >= 100,] + + nodes.to.segment = nodes.ov.hets[, unique(query.id)] + + new.sl = seqlengths(gg) + + segs = lapply(nodes.to.segment, + function(qid) { + message("Starting segmentation for : ", qid) + hets.subset.dt = hets.dt[nodes.ov.hets[query.id == qid, subject.id],] + cna = CNA(hets.subset.dt[, BAF], + hets.subset.dt[, as.character(seqnames)], + hets.subset.dt[, start], + data.type = "logratio") + seg = segment(smooth.CNA(cna), + alpha = 1e-5, + verbose = TRUE) + utils::capture.output({seg_dt = print(seg); setDT(seg_dt)}, + type = "output", + file = "/dev/null") + out = seg2gr(seg_dt[!(is.na(seg.mean) | is.na(loc.start) | is.na(loc.end))], new.sl) + out = gr.fix(out, new.sl, drop = T) + ## get the number of hets per segment + values(out)[, "nhets"] = out %N% hets.gr + ## make sure the segment is on the order of high kbp + out = out %Q% (nhets > 50) + message("Number of segments: ", length(out)) + names(out) = NULL + if (length(out) > 1) { + return(gr.start(out[2:length(out)])) + } + return(GRanges()) + message("Finished!") + }) + + segs = do.call(grbind, segs) + } if (is.null(segs)) { message("No extra breakends") @@ -789,6 +795,7 @@ ##binstats.gg$edges$mark(reward = new.gg$edges$dt[match(binstats.gg$edges$dt$og.edge.id, edge.id), reward]) binstats.gg$edges$mark(cnloh = new.gg$edges$dt[match(binstats.gg$edges$dt$og.edge.id, edge.id), cnloh]) ## binstats.gg$edges$mark(cnloh = gg$edges$dt[match(binstats.gg$edges$dt$og.edge.id, edge.id), cnloh]) + print('tracer2') } else { tmp = as.data.table(readRDS(opt$hets)) tmp = rbind(tmp[, .(seqnames, start, end, strand = "*", diff --git a/bin/non_integer_balance.R b/bin/non_integer_balance.R index 041e2cd..4ea7192 100644 --- a/bin/non_integer_balance.R +++ b/bin/non_integer_balance.R @@ -154,7 +154,11 @@ if ((opt$overwrite) | (!file.exists(binstats.gg.fn))) { message("Starting binstats") - binstats.gg = gGnome::binstats(jab, bins = cov, field = opt$field, lp = TRUE) + if (ncn.x == 0) { # handle no sex chr case by disabling loess + binstats.gg = gGnome::binstats(jab, bins = cov, field = opt$field, lp = TRUE, loess=FALSE) + } else { + binstats.gg = gGnome::binstats(jab, bins = cov, field = opt$field, lp = TRUE) + } ## save binstats saveRDS(binstats.gg, binstats.gg.fn) diff --git a/tests/nextflow.config b/tests/nextflow.config index 6c7e174..b16a731 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -278,7 +278,7 @@ params { blacklist_coverage_jabba = "${params.mski_base}/JaBbA/blacklist_coverage/hg19/maskA_re.rds" build_non_integer_balance = "hg19" mask_non_integer_balance = "${params.mski_base}/allelic_cn/non_integer_balance/hg19/mask_with_segdups.rds" - mask_lp_phased_balance = "${params.mski_base}/JaBbA/blacklist_coverage/hg19/maskA_re.rds" + mask_lp_phased_balance = "${params.mski_base}/allelic_cn/lp_phased_balance/lp_phased_balance_maskA_re.rds" } 'GATK.GRCh38' { fasta = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" diff --git a/workflows/nfjabba.nf b/workflows/nfjabba.nf index f908ac0..171bcc9 100644 --- a/workflows/nfjabba.nf +++ b/workflows/nfjabba.nf @@ -521,27 +521,17 @@ include { COV_CBS as CBS } from '../subworkflows/lo // JaBbA include { COV_JUNC_JABBA as JABBA } from '../subworkflows/local/jabba/main' -include { COV_JUNC_JABBA as JABBA_WITH_SVABA } from '../subworkflows/local/jabba/main' -include { COV_JUNC_JABBA as JABBA_WITH_GRIDSS } from '../subworkflows/local/jabba/main' // Events include { GGRAPH_EVENTS as EVENTS } from '../subworkflows/local/events/main' -include { GGRAPH_EVENTS as EVENTS_WITH_GRIDSS } from '../subworkflows/local/events/main' -include { GGRAPH_EVENTS as EVENTS_WITH_SVABA } from '../subworkflows/local/events/main' // Fusions include { GGRAPH_FUSIONS as FUSIONS } from '../subworkflows/local/fusions/main' -include { GGRAPH_FUSIONS as FUSIONS_WITH_GRIDSS } from '../subworkflows/local/fusions/main' -include { GGRAPH_FUSIONS as FUSIONS_WITH_SVABA } from '../subworkflows/local/fusions/main' // Alleic CN include { COV_GGRAPH_NON_INTEGER_BALANCE as NON_INTEGER_BALANCE } from '../subworkflows/local/allelic_cn/main' -include { COV_GGRAPH_NON_INTEGER_BALANCE as NON_INTEGER_BALANCE_WITH_GRIDSS } from '../subworkflows/local/allelic_cn/main' -include { COV_GGRAPH_NON_INTEGER_BALANCE as NON_INTEGER_BALANCE_WITH_SVABA } from '../subworkflows/local/allelic_cn/main' include { COV_GGRAPH_LP_PHASED_BALANCE as LP_PHASED_BALANCE } from '../subworkflows/local/allelic_cn/main' -include { COV_GGRAPH_LP_PHASED_BALANCE as LP_PHASED_BALANCE_WITH_GRIDSS } from '../subworkflows/local/allelic_cn/main' -include { COV_GGRAPH_LP_PHASED_BALANCE as LP_PHASED_BALANCE_WITH_SVABA } from '../subworkflows/local/allelic_cn/main' /* @@ -671,9 +661,9 @@ workflow NFJABBA { boolean runDryClean = false boolean runAscat = false boolean runJabba = false + boolean runAllelicCN = false boolean runEvents = false boolean runFusions = false - boolean runAllelicCN = false // Set flags based on params.step using a cascading approach // Fall through to the next case if the previous case is true @@ -698,12 +688,12 @@ workflow NFJABBA { runAscat = true case 'jabba': runJabba = true + case 'allelic_cn': + runAllelicCN = true case 'events': runEvents = true case 'fusions': runFusions = true - case 'allelic_cn': - runAllelicCN = true break default: error "Invalid step: ${params.step}" @@ -1387,46 +1377,6 @@ workflow NFJABBA { ] } - JABBA( - jabba_inputs - blacklist_junctions_jabba, - geno_jabba, - indel_jabba, - tfield_jabba, - iter_jabba, - rescue_window_jabba, - rescue_all_jabba, - nudgebalanced_jabba, - edgenudge_jabba, - strict_jabba, - allin_jabba, - field_jabba, - maxna_jabba, - blacklist_coverage_jabba, - purity_jabba, - pp_method_jabba, - cnsignif_jabba, - slack_jabba, - linear_jabba, - tilim_jabba, - epgap_jabba, - fix_thres_jabba, - lp_jabba, - ism_jabba, - filter_loose_jabba, - gurobi_jabba, - verbose_jabba - ) - - jabba_rds = Channel.empty().mix(JABBA.out.jabba_rds) - jabba_gg = Channel.empty().mix(JABBA.out.jabba_gg) - jabba_vcf = Channel.empty().mix(JABBA.out.jabba_vcf) - jabba_raw_rds = Channel.empty().mix(JABBA.out.jabba_raw_rds) - opti = Channel.empty().mix(JABBA.out.opti) - jabba_seg = Channel.empty().mix(JABBA.out.jabba_seg) - karyograph = Channel.empty().mix(JABBA.out.karyograph) - versions = versions.mix(JABBA.out.versions) - } else { meta_for_joining = tumor_dryclean_cov.map{ meta, tumor_cov -> [meta.patient, meta] } // can use any of the inputs to get the meta data @@ -1449,14 +1399,14 @@ workflow NFJABBA { .join(cbs_seg_rds_for_joining) .join(cbs_nseg_rds_for_joining) - if (tools_used.contains('svaba')) { + if (tools_used.contains('gridss')) { - vcf_from_sv_calling_for_joining = vcf_from_sv_calling.map{ meta, juction -> [ meta.patient, junction ] } - unfiltered_som_sv_for_joining = j_supp.map{ meta, j_supp -> [ meta.patient, j_supp ] } + vcf_from_sv_calling_gridss_for_joining = vcf_from_sv_calling_gridss.map{ meta, junction -> [ meta.patient, junction ] } + unfiltered_som_sv_gridss_for_joining = unfiltered_som_sv_gridss.map{ meta, j_supp -> [ meta.patient, j_supp ] } - jabba_w_svaba_inputs = jabba_inputs - .join(vcf_from_sv_calling_for_joining) - .join(unfiltered_som_sv_for_joining) + jabba_inputs = jabba_inputs + .join(vcf_from_sv_calling_gridss_for_joining) + .join(unfiltered_som_sv_gridss_for_joining) .map{ patient, meta, cov, hets, ploidy, seg, nseg, junction, j_supp -> [ meta, @@ -1469,57 +1419,16 @@ workflow NFJABBA { nseg ] } - - JABBA_WITH_SVABA( - jabba_w_svaba_inputs, - blacklist_junctions_jabba, - geno_jabba, - indel_jabba, - tfield_jabba, - iter_jabba, - rescue_window_jabba, - rescue_all_jabba, - nudgebalanced_jabba, - edgenudge_jabba, - strict_jabba, - allin_jabba, - field_jabba, - maxna_jabba, - blacklist_coverage_jabba, - purity_jabba, - pp_method_jabba, - cnsignif_jabba, - slack_jabba, - linear_jabba, - tilim_jabba, - epgap_jabba, - fix_thres_jabba, - lp_jabba, - ism_jabba, - filter_loose_jabba, - gurobi_jabba, - verbose_jabba - ) - - jabba_rds_with_svaba = Channel.empty().mix(JABBA_WITH_SVABA.out.jabba_rds) - jabba_gg_with_svaba = Channel.empty().mix(JABBA_WITH_SVABA.out.jabba_gg) - jabba_vcf_with_svaba = Channel.empty().mix(JABBA_WITH_SVABA.out.jabba_vcf) - jabba_raw_rds_with_svaba = Channel.empty().mix(JABBA_WITH_SVABA.out.jabba_raw_rds) - opti_with_svaba = Channel.empty().mix(JABBA_WITH_SVABA.out.opti) - jabba_seg_with_svaba = Channel.empty().mix(JABBA_WITH_SVABA.out.jabba_seg) - karyograph_with_svaba = Channel.empty().mix(JABBA_WITH_SVABA.out.karyograph) - versions_with_svaba = versions.mix(JABBA_WITH_SVABA.out.versions) - } - if (tools_used.contains('gridss')) { + else if (tools_used.contains('svaba')) { - vcf_from_sv_calling_gridss_for_joining = vcf_from_sv_calling_gridss.map{ meta, junction -> [ meta.patient, junction ] } - unfiltered_som_sv_gridss_for_joining = unfiltered_som_sv_gridss.map{ meta, j_supp -> [ meta.patient, j_supp ] } + vcf_from_sv_calling_for_joining = vcf_from_sv_calling.map{ meta, juction -> [ meta.patient, junction ] } + unfiltered_som_sv_for_joining = j_supp.map{ meta, j_supp -> [ meta.patient, j_supp ] } - jabba_w_gridss_inputs = jabba_inputs - .join(vcf_from_sv_calling_gridss_for_joining) - .join(unfiltered_som_sv_gridss_for_joining) + jabba_inputs = jabba_inputs + .join(vcf_from_sv_calling_for_joining) + .join(unfiltered_som_sv_for_joining) .map{ patient, meta, cov, hets, ploidy, seg, nseg, junction, j_supp -> [ meta, @@ -1532,93 +1441,48 @@ workflow NFJABBA { nseg ] } - - JABBA_WITH_GRIDSS( - jabba_w_gridss_inputs, - blacklist_junctions_jabba, - geno_jabba, - indel_jabba, - tfield_jabba, - iter_jabba, - rescue_window_jabba, - rescue_all_jabba, - nudgebalanced_jabba, - edgenudge_jabba, - strict_jabba, - allin_jabba, - field_jabba, - maxna_jabba, - blacklist_coverage_jabba, - purity_jabba, - pp_method_jabba, - cnsignif_jabba, - slack_jabba, - linear_jabba, - tilim_jabba, - epgap_jabba, - fix_thres_jabba, - lp_jabba, - ism_jabba, - filter_loose_jabba, - gurobi_jabba, - verbose_jabba - ) - - jabba_rds_with_gridss = Channel.empty().mix(JABBA_WITH_GRIDSS.out.jabba_rds) - jabba_gg_with_gridss = Channel.empty().mix(JABBA_WITH_GRIDSS.out.jabba_gg) - jabba_seg_with_gridss = Channel.empty().mix(JABBA_WITH_GRIDSS.out.jabba_seg) - karyograph_with_gridss = Channel.empty().mix(JABBA_WITH_GRIDSS.out.karyograph) - versions_with_gridss = versions.mix(JABBA_WITH_GRIDSS.out.versions) - - } - } - } - } - - if (runEvents) { - if (tools_used.contains('events')) { - if (params.step == 'events') { - EVENTS(input_sample, fasta) - versions = Channel.empty().mix(EVENTS.out.versions) - - events_output = Channel.empty().mix(EVENTS.out.events_output) - } else { - if (tools_used.contains('gridss')) { - EVENTS_WITH_GRIDSS(jabba_rds_with_gridss, fasta) - events_w_gridss_versions = Channel.empty().mix(EVENTS_WITH_GRIDSS.out.versions) - - events_w_gridss_output = Channel.empty().mix(EVENTS_WITH_GRIDSS.out.events_output) - } - if (tools_used.contains('svaba')) { - EVENTS_WITH_SVABA(jabba_rds_with_svaba, fasta) - events_w_svaba_versions = Channel.empty().mix(EVENTS_WITH_SVABA.out.versions) - - events_w_svaba_output = Channel.empty().mix(EVENTS_WITH_SVABA.out.events_output) } - } - } - } - - if (runFusions) { - if (tools_used.contains('fusions')) { - if (params.step == 'fusions') { - FUSIONS(input_sample, gencode_fusions) - fusions_output = Channel.empty().mix(FUSIONS.out.fusions_output) - versions = Channel.empty().mix(FUSIONS.out.versions) - } else { - if (tools_used.contains('gridss')) { - FUSIONS_WITH_GRIDSS(jabba_rds_with_gridss, gencode_fusions) - fusions_w_gridss_output = Channel.empty().mix(FUSIONS_WITH_GRIDSS.out.fusions_output) - fusions_w_gridss_versions = Channel.empty().mix(FUSIONS_WITH_GRIDSS.out.versions) + JABBA( + jabba_inputs, + blacklist_junctions_jabba, + geno_jabba, + indel_jabba, + tfield_jabba, + iter_jabba, + rescue_window_jabba, + rescue_all_jabba, + nudgebalanced_jabba, + edgenudge_jabba, + strict_jabba, + allin_jabba, + field_jabba, + maxna_jabba, + blacklist_coverage_jabba, + purity_jabba, + pp_method_jabba, + cnsignif_jabba, + slack_jabba, + linear_jabba, + tilim_jabba, + epgap_jabba, + fix_thres_jabba, + lp_jabba, + ism_jabba, + filter_loose_jabba, + gurobi_jabba, + verbose_jabba + ) - } - if (tools_used.contains('svaba')) { - FUSIONS_WITH_SVABA(jabba_rds_with_svaba, gencode_fusions) - fusions_w_svaba_output = Channel.empty().mix(FUSIONS_WITH_SVABA.out.fusions_output) - fusions_w_svaba_versions = Channel.empty().mix(FUSIONS_WITH_SVABA.out.versions) + jabba_rds = Channel.empty().mix(JABBA.out.jabba_rds) + jabba_gg = Channel.empty().mix(JABBA.out.jabba_gg) + jabba_vcf = Channel.empty().mix(JABBA.out.jabba_vcf) + jabba_raw_rds = Channel.empty().mix(JABBA.out.jabba_raw_rds) + opti = Channel.empty().mix(JABBA.out.opti) + jabba_seg = Channel.empty().mix(JABBA.out.jabba_seg) + karyograph = Channel.empty().mix(JABBA.out.karyograph) + versions = versions.mix(JABBA.out.versions) - } } } } @@ -1629,23 +1493,12 @@ workflow NFJABBA { non_integer_balance_inputs = input_sample.map{ meta, cov, hets, ggraph -> [ meta, ggraph, cov, hets ] } het_pileups_for_joining = input_sample.map{ meta, cov, hets, ggraph -> [ meta.patient, hets ] } } else { - if (tools_used.contains('gridss')) { - jabba_rds_with_gridss_for_joining = jabba_rds_with_gridss.map{ meta, rds -> [ meta.patient, rds ] } - non_integer_balance_inputs = meta_for_joining - .join(jabba_rds_with_gridss_for_joining) - .join(tumor_dryclean_cov_for_joining) - .join(het_pileups_for_joining) - .map{ patient, meta, rds, cov, hets -> [ meta, rds, cov, hets ] } - - } - if (tools_used.contains('svaba')) { - jabba_rds_with_svaba_for_joining = jabba_rds_with_svaba.map{ meta, rds -> [ meta.patient, rds ] } - non_integer_balance_inputs = meta_for_joining - .join(jabba_rds_with_svaba_for_joining) - .join(tumor_dryclean_cov_for_joining) - .join(het_pileups_for_joining) - .map{ patient, meta, rds, cov, hets -> [ meta, rds, cov, hets ] } - } + jabba_rds_for_joining = jabba_rds.map{ meta, rds -> [ meta.patient, rds ] } + non_integer_balance_inputs = meta_for_joining + .join(jabba_rds_for_joining) + .join(tumor_dryclean_cov_for_joining) + .join(het_pileups_for_joining) + .map{ patient, meta, rds, cov, hets -> [ meta, rds, cov, hets ] } } NON_INTEGER_BALANCE( @@ -1705,6 +1558,36 @@ workflow NFJABBA { lp_phased_balance_unphased_allelic_gg = Channel.empty().mix(LP_PHASED_BALANCE.out.lp_phased_balance_unphased_allelic_gg) } } + + if (runEvents) { + if (tools_used.contains('events')) { + if (params.step == 'events') { + events_input = input_sample + versions = Channel.empty().mix(EVENTS.out.versions) + } else { + events_input = lp_phased_balance_balanced_gg + } + + EVENTS(events_input, fasta) + events_versions = Channel.empty().mix(EVENTS.out.versions) + events_output = Channel.empty().mix(EVENTS.out.events_output) + } + } + + if (runFusions) { + if (tools_used.contains('fusions')) { + if (params.step == 'fusions') { + fusions_input = input_sample + } else { + fusions_input = lp_phased_balance_balanced_gg + } + + FUSIONS(fusions_input, gencode_fusions) + fusions_output = Channel.empty().mix(FUSIONS.out.fusions_output) + versions = Channel.empty().mix(FUSIONS.out.versions) + } + } + } /* From 79186bd4809985923309f47b332ae7196ea49ac7 Mon Sep 17 00:00:00 2001 From: shihabdider Date: Sat, 11 May 2024 15:28:43 -0400 Subject: [PATCH 3/3] --amend --- bin/lp_phased_balance.R | 1 + conf/modules/alleic_cn.config | 34 -------------------------------- conf/modules/allelic_cn.config | 36 ---------------------------------- conf/modules/events.config | 2 +- 4 files changed, 2 insertions(+), 71 deletions(-) delete mode 100644 conf/modules/alleic_cn.config diff --git a/bin/lp_phased_balance.R b/bin/lp_phased_balance.R index 145635a..0d8afd6 100644 --- a/bin/lp_phased_balance.R +++ b/bin/lp_phased_balance.R @@ -783,6 +783,7 @@ } if(!opt$from_maf) { + gg$meta$ploidy = ifelse(is.null(gg$meta$ploidy), NA, gg$meta$ploidy) jab = zitools:::gg2jab(gg, purity = gg$meta$purity, ploidy = gg$meta$ploidy) jab = jabba.alleles2(jab, hets.gr, verbose = TRUE, uncoupled = TRUE, marginal = opt$marginal) diff --git a/conf/modules/alleic_cn.config b/conf/modules/alleic_cn.config deleted file mode 100644 index 0fe7d71..0000000 --- a/conf/modules/alleic_cn.config +++ /dev/null @@ -1,34 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Config file for defining DSL2 per module options and publishing paths -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Available keys to override module options: - ext.args = Additional arguments appended to command in module. - ext.args2 = Second set of arguments appended to command in module (multi-tool modules). - ext.args3 = Third set of arguments appended to command in module (multi-tool modules). - ext.prefix = File name prefix for output files. - ext.when = When to run the module. ----------------------------------------------------------------------------------------- -*/ -// ALLEIC_CN configs - -process { - - withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:NON_INTEGER_BALANCE' { - ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } - publishDir = [ - mode: params.publish_dir_mode, - path: { "${params.outdir}/alleic_cn/non_integer_balance/${meta.id}/" }, - pattern: "*{.rds*,.command.*}" - ] - } - - withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:LP_PHASED_BALANCE' { - ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } - publishDir = [ - mode: params.publish_dir_mode, - path: { "${params.outdir}/alleic_cn/lp_phased_balance/${meta.id}/" }, - pattern: "*{.rds*,.command.*}" - ] - } -} diff --git a/conf/modules/allelic_cn.config b/conf/modules/allelic_cn.config index a5e459b..0fe7d71 100644 --- a/conf/modules/allelic_cn.config +++ b/conf/modules/allelic_cn.config @@ -23,24 +23,6 @@ process { ] } - withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:NON_INTEGER_BALANCE_WITH_GRIDSS' { - ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } - publishDir = [ - mode: params.publish_dir_mode, - path: { "${params.outdir}/alleic_cn/non_integer_balance_with_gridss/${meta.id}/" }, - pattern: "*{.rds*,.command.*}" - ] - } - - withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:NON_INTEGER_BALANCE_WITH_SVABA' { - ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } - publishDir = [ - mode: params.publish_dir_mode, - path: { "${params.outdir}/alleic_cn/non_integer_balance_with_svaba/${meta.id}/" }, - pattern: "*{.rds*,.command.*}" - ] - } - withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:LP_PHASED_BALANCE' { ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } publishDir = [ @@ -49,22 +31,4 @@ process { pattern: "*{.rds*,.command.*}" ] } - - withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:LP_PHASED_BALANCE_WITH_GRIDSS' { - ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } - publishDir = [ - mode: params.publish_dir_mode, - path: { "${params.outdir}/alleic_cn/lp_phased_balance_with_gridss/${meta.id}/" }, - pattern: "*{.rds*,.command.*}" - ] - } - - withName: 'MSKILABORG_NFJABBA:NFJABBA:ALLEIC_CN:LP_PHASED_BALANCE_WITH_SVABA' { - ext.when = { params.tools && params.tools.split(',').contains('alleic_cn') } - publishDir = [ - mode: params.publish_dir_mode, - path: { "${params.outdir}/alleic_cn/lp_phased_balance_with_svaba/${meta.id}/" }, - pattern: "*{.rds*,.command.*}" - ] - } } diff --git a/conf/modules/events.config b/conf/modules/events.config index 46d6899..15e58dd 100644 --- a/conf/modules/events.config +++ b/conf/modules/events.config @@ -18,7 +18,7 @@ process { ext.when = { params.tools && params.tools.split(',').contains('events') } publishDir = [ mode: params.publish_dir_mode, - path: { "${params.outdir}/events/${meta.id}/" }, + path: { "${params.outdir}/Events/${meta.id}/" }, pattern: "*{.rds*,.command.*}" ] }