refactor: put allelic_cn before events calling

mskilab-org · May 8, 2024 · c661907 · c661907
1 parent 2b0deb6
commit c661907
Show file tree

Hide file tree

Showing 5 changed files with 152 additions and 256 deletions.
diff --git a/bin/ascat_seg.R b/bin/ascat_seg.R
@@ -1,6 +1,6 @@
 {
     library(optparse)
-    
+
     ## DO NOT FAIL SILENTLY
     options(error = function() {traceback(2); quit("no", 1)})
 
@@ -35,7 +35,7 @@
         opt = parse_args(parseobj)
 
         print(opt)
-        
+
         print(.libPaths())
         options(error=function()traceback(2))
 
@@ -59,7 +59,7 @@
     #'
     #' @param gg (gGraph) diploid gGraph (expect cn.low and cn.high as node metadata)
     #' @param verbose (logical) default FALSE
-    #' 
+    #'
     #' @return gGraph with field allele and cn
     diploid2haploid = function(gg, verbose = FALSE) {
         og.nodes.gr = gg$nodes$gr[, c("cn.low", "cn.high", "var.low", "var.high", "cn", "node.id", "nhets")]
@@ -286,7 +286,7 @@
                 ##                                                          log.p = T) +
                 ##                                                  pnbinom(y, mu = centers[tot.cn - j + 1],
                 ##                                                          size = centers[tot.cn - j + 1],
-                ##                                                          log.p = T))) 
+                ##                                                          log.p = T)))
                 ll = sapply(0:(floor(tot.cn/2)), function(j) sum(ppois(x,centers[j+1], log.p = T) +
                                                                 ppois(y,centers[tot.cn-j+1],log.p = T)))
                 ll = ll - min(ll)
@@ -300,7 +300,7 @@
                         ##                                                          log.p = T) +
                         ##                                                  pnbinom(y, mu = centers[tot.cn - j],
                         ##                                                          size = centers[tot.cn - j] / 2,
-                        ##                                                          log.p = T))) 
+                        ##                                                          log.p = T)))
                         ll = sapply(0:(floor((tot.cn - 1)/2)), function(j) sum(ppois(x,centers[j+1], log.p = T) +
                                                                     ppois(y,centers[tot.cn-j],log.p = T)))
                         ll = ll - min(ll)
@@ -315,7 +315,7 @@
                     ##                                                          log.p = T) +
                     ##                                                  pnbinom(y, mu = centers[tot.cn - j + 2],
                     ##                                                          size = centers[tot.cn - j + 2],
-                    ##                                                          log.p = T))) 
+                    ##                                                          log.p = T)))
                     ll = sapply(0:(floor((tot.cn + 1)/2)), function(j) sum(ppois(x,centers[j+1], log.p = T) +
                                                                             ppois(y,centers[tot.cn-j+2],log.p = T)))
                     ll = ll - min(ll)
@@ -363,7 +363,7 @@
 
         ## ## ###########
         ## ## phasing
-        ## ## ########### 
+        ## ## ###########
 
         ## ## iterate through all reference junctions and apply (wishful thinking) heuristic
         ## ##
@@ -561,7 +561,7 @@
     #' @param agt.fname (character) path to sites.txt
     #' @param min.frac (numeric) between 0 and 1, min frequency in normal to count as het site
     #' @param max.frac (numeric) between 0 and 1, max frequency in normal to count as het site
-    #' 
+    #'
     #' @return allele gTrack
     grab.hets = function(agt.fname = NULL,
                         min.frac = 0.2,
@@ -605,7 +605,7 @@
         if (!file.exists(agt.fname)) {
             stop("invalid file")
         }
-        
+
         gr = readRDS(agt.fname)
 
         if (!inherits(gr, "GRanges")) {
@@ -748,6 +748,8 @@
                             median(ratio, na.rm = T)] > 0.8,
                     "XX",
                     "XY")
+    # handle single chr case as XY
+    gender = ifelse(!is.na(gender), gender, "XY")
     message("The gender of this sample: ", gender)
 
     message("Starting ASCAT!!")

diff --git a/bin/lp_phased_balance.R b/bin/lp_phased_balance.R
@@ -705,45 +705,51 @@
                                     dt2gr(hets.dt[, .(seqnames, start, end, strand = "*")]),
                                     return.type = "data.table")
 
-    nodes.ov.hets[, count := .N, by = query.id]
-
-    ## we should only segment relatively wide nodes with low copy number
-    nodes.ov.hets = nodes.ov.hets[count >= 100,]
-
-    nodes.to.segment = nodes.ov.hets[, unique(query.id)]
-
-    new.sl = seqlengths(gg)
-
-    segs = lapply(nodes.to.segment,
-                  function(qid) {
-                      message("Starting segmentation for : ", qid)
-                      hets.subset.dt = hets.dt[nodes.ov.hets[query.id == qid, subject.id],]
-                      cna = CNA(hets.subset.dt[, BAF],
-                                hets.subset.dt[, as.character(seqnames)],
-                                hets.subset.dt[, start],
-                                data.type = "logratio")
-                      seg = segment(smooth.CNA(cna),
-                                    alpha = 1e-5,
-                                    verbose = TRUE)
-                      utils::capture.output({seg_dt = print(seg); setDT(seg_dt)},
-                                            type = "output",
-                                            file = "/dev/null")
-                      out = seg2gr(seg_dt[!(is.na(seg.mean) | is.na(loc.start) | is.na(loc.end))], new.sl)
-                      out = gr.fix(out, new.sl, drop = T)
-                      ## get the number of hets per segment
-                      values(out)[, "nhets"] = out %N% hets.gr
-                      ## make sure the segment is on the order of high kbp
-                      out = out %Q% (nhets > 50)
-                      message("Number of segments: ", length(out))
-                      names(out) = NULL
-                      if (length(out) > 1) {
-                          return(gr.start(out[2:length(out)]))
-                      }
-                      return(GRanges())
-                      message("Finished!")
-                  })
-
-    segs = do.call(grbind, segs)
+    # handle case where nodes.ov.hets is empty
+    if (nrow(nodes.ov.hets) == 0) {
+        message("No nodes overlap with hets")
+        segs = NULL
+    } else {
+        nodes.ov.hets[, count := .N, by = query.id]
+
+        ## we should only segment relatively wide nodes with low copy number
+        nodes.ov.hets = nodes.ov.hets[count >= 100,]
+
+        nodes.to.segment = nodes.ov.hets[, unique(query.id)]
+
+        new.sl = seqlengths(gg)
+
+        segs = lapply(nodes.to.segment,
+                      function(qid) {
+                          message("Starting segmentation for : ", qid)
+                          hets.subset.dt = hets.dt[nodes.ov.hets[query.id == qid, subject.id],]
+                          cna = CNA(hets.subset.dt[, BAF],
+                                    hets.subset.dt[, as.character(seqnames)],
+                                    hets.subset.dt[, start],
+                                    data.type = "logratio")
+                          seg = segment(smooth.CNA(cna),
+                                        alpha = 1e-5,
+                                        verbose = TRUE)
+                          utils::capture.output({seg_dt = print(seg); setDT(seg_dt)},
+                                                type = "output",
+                                                file = "/dev/null")
+                          out = seg2gr(seg_dt[!(is.na(seg.mean) | is.na(loc.start) | is.na(loc.end))], new.sl)
+                          out = gr.fix(out, new.sl, drop = T)
+                          ## get the number of hets per segment
+                          values(out)[, "nhets"] = out %N% hets.gr
+                          ## make sure the segment is on the order of high kbp
+                          out = out %Q% (nhets > 50)
+                          message("Number of segments: ", length(out))
+                          names(out) = NULL
+                          if (length(out) > 1) {
+                              return(gr.start(out[2:length(out)]))
+                          }
+                          return(GRanges())
+                          message("Finished!")
+                      })
+
+        segs = do.call(grbind, segs)
+    }
 
     if (is.null(segs)) {
         message("No extra breakends")
@@ -789,6 +795,7 @@
         ##binstats.gg$edges$mark(reward = new.gg$edges$dt[match(binstats.gg$edges$dt$og.edge.id, edge.id), reward])
         binstats.gg$edges$mark(cnloh = new.gg$edges$dt[match(binstats.gg$edges$dt$og.edge.id, edge.id), cnloh])
         ## binstats.gg$edges$mark(cnloh = gg$edges$dt[match(binstats.gg$edges$dt$og.edge.id, edge.id), cnloh])
+        print('tracer2')
     } else {
         tmp = as.data.table(readRDS(opt$hets))
         tmp = rbind(tmp[, .(seqnames, start, end, strand = "*",

diff --git a/bin/non_integer_balance.R b/bin/non_integer_balance.R
@@ -154,7 +154,11 @@
     if ((opt$overwrite) | (!file.exists(binstats.gg.fn))) {
 
         message("Starting binstats")
-        binstats.gg = gGnome::binstats(jab, bins = cov, field = opt$field, lp = TRUE)
+        if (ncn.x == 0) { # handle no sex chr case by disabling loess
+            binstats.gg = gGnome::binstats(jab, bins = cov, field = opt$field, lp = TRUE, loess=FALSE)
+        } else {
+            binstats.gg = gGnome::binstats(jab, bins = cov, field = opt$field, lp = TRUE)
+        }
 
         ## save binstats
         saveRDS(binstats.gg, binstats.gg.fn)

diff --git a/tests/nextflow.config b/tests/nextflow.config
@@ -278,7 +278,7 @@ params {
         blacklist_coverage_jabba         = "${params.mski_base}/JaBbA/blacklist_coverage/hg19/maskA_re.rds"
         build_non_integer_balance        = "hg19"
         mask_non_integer_balance         = "${params.mski_base}/allelic_cn/non_integer_balance/hg19/mask_with_segdups.rds"
-        mask_lp_phased_balance           = "${params.mski_base}/JaBbA/blacklist_coverage/hg19/maskA_re.rds"
+        mask_lp_phased_balance           = "${params.mski_base}/allelic_cn/lp_phased_balance/lp_phased_balance_maskA_re.rds"
 	}
 	'GATK.GRCh38' {
             fasta                        = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta"