From c352c3996290f999dc014be94d3004b5ee817d4c Mon Sep 17 00:00:00 2001 From: ShaiberAlon Date: Fri, 7 Jan 2022 15:16:08 -0500 Subject: [PATCH 1/9] added seismic to event callers --- R/eventCallers.R | 119 ++++++++++++++++++++- tests/testthat/test_gGnome_event_callers.R | 6 +- 2 files changed, 120 insertions(+), 5 deletions(-) diff --git a/R/eventCallers.R b/R/eventCallers.R index b340d71e..41dc6637 100755 --- a/R/eventCallers.R +++ b/R/eventCallers.R @@ -1053,7 +1053,7 @@ annotate_walks = function(walks) #' @param gg gGraph #' @return gGraph with nodes and edges annotated with complex events in their node and edge metadata and in the graph meta data field $events #' @export -events = function(gg, verbose = TRUE, mark = FALSE, QRP = FALSE) +events = function(gg, verbose = TRUE, mark = FALSE, QRP = FALSE, seismic = FALSE) { gg = gg %>% simple(mark = TRUE) if (verbose) @@ -1088,7 +1088,13 @@ events = function(gg, verbose = TRUE, mark = FALSE, QRP = FALSE) if (verbose) message('Finished qrp') } - + + if (seismic){ + gg = gg %>% seismic(mark = TRUE) + if (verbose) + message('Finished seismic') + } + ev = rbind( gg$meta$simple, gg$meta$chromothripsis, @@ -1108,6 +1114,11 @@ events = function(gg, verbose = TRUE, mark = FALSE, QRP = FALSE) gg$meta$qrpmin, gg$meta$qrpmix, fill = TRUE)[, ev.id := seq_len(.N)] } + if (seismic){ + ev = rbind( + ev, + gg$meta$seismic, fill = TRUE)[, ev.id := seq_len(.N)] + } gg$set(events = ev) return(gg) @@ -2917,3 +2928,107 @@ qrp = function(gg, thresh = 1e6, max.small = 1e5, } + +#' @name seismic +#' @description +#' Identification of seismic amplicons according to the definitions in Rosswog et al. +#' should be used. +#' +#' @param gg gGraph +#' @param amp.thresh (numeric) multiply of rounded ploidy to use for detection of amplification (2). The threshold for amplification is set to >= rounded.ploidy * amp.thresh + 1 where rounded.ploidy is either 2 or 4 (determined using ploidy.thresh). +#' @param ploidy.thresh (numeric) threshold to use above which ploidy is considered 4 (and up to it, 2). Rosswog et al. used 2 and hence this is the default value. +#' @param min.internal (numeric) minimal number of internal junctions in a seismic amplification (14). +#' @param mc.cores (numeric) number of cores to use (1) +#' @param mark (logical) nodes and edges with color if they are included in a seismic amplification (TRUE) +#' @param mark.col (character) color to use (if mark set to TRUE) to mark edges and nodes (purple). +#' +#' @return gg +#' @export +seismic = function(gg, amp.thresh = 2, ploidy.thresh = 2, min.internal = 14, + mc.cores = 1, mark = TRUE, mark.col = 'purple') +{ + + gg$nodes$mark(seismic = as.integer(NA)) + gg$edges$mark(seismic = as.integer(NA)) + gg$set(seismic = data.table()) + ploidy = gg$nodes$dt[!is.na(cn), sum(cn*as.numeric(width))/sum(as.numeric(width))] + + # following Rosswog et al. we use 5 as the threshold for samples with ploidy up to 2 + # and 9 as the threshold for samples with ploidy above 3 + amp.thresh = ifelse(ploidy <= ploidy.thresh, 2 * amp.thresh + 1, 4 * amp.thresh + 1) + + amplified = (gg$nodes$dt$cn) >= amp.thresh + + gg$clusters(amplified, mode = 'strong') + + cids = unique(gg$nodes$dt$cluster) + cids = cids[!is.na(cids)] + + edt = mclapply(cids, function(cid){ + sg = gg[cluster == cid] + if (length(sg$edges[type == 'ALT']) < 2){ + return(NULL) + } + + sg.no.alt = sg$clone() + # mark regions with rid (region ID) + eids = sg.no.alt$edges$dt$type == 'REF' + sg.no.alt$clusters(j = eids) + sg$nodes$mark(rid = sg.no.alt$nodes$dt$cluster) + + # for each ALT edge check if both left and right node are in the same region (i.e. it is internal) + internal = sg$edges[type == 'ALT']$left$dt$rid == sg$edges[type == 'ALT']$right$dt$rid + + # if it is not internal then check if it falls on the edge of regions + ndt = sg$nodes$dt + left.ids = ndt[, min(node.id), by = rid]$V1 + right.ids = ndt[, max(node.id), by = rid]$V1 + edge.nodes = c(left.ids, right.ids) + + flanking = (sg$edges[type == 'ALT']$left$dt$node.id %in% edge.nodes) & (sg$edges[type == 'ALT']$right$dt$node.id %in% edge.nodes) + + other = !internal & !flanking + + return(data.table(cid = cid, internal.junc = sum(internal), flanking.junc = sum(flanking), other.junc = sum(other))) + }, mc.cores = mc.cores) + + edt = rbindlist(edt) + + # ad-hoc helper function to use to get the footprint + nodes2footprint = function(n){ + footprint = gr.stripstrand(n$footprint) + return(paste(gr.string(footprint), collapse = ';')) + } + + if (edt[,.N] > 0){ + edt = edt[internal.junc >= min.internal] + cids = edt$cid + ev.ids = seq_along(cids) + if (length(cids) > 0){ + for (ev.id in ev.ids){ + # annotate all edges that have both breakpoints whithin the cluster + cid = cids[ev.id] + e.idx = which(gg$edges[type == 'ALT']$left$dt$cluster == cid & gg$edges[type == 'ALT']$right$dt$cluster == cid) + eids = gg$edges[type == 'ALT'][e.idx]$dt$edge.id + gg$edges[eids]$mark(seismic = ev.id) + + # annotate nodes + gg$nodes[cluster == cid]$mark(seismic = ev.id) + } + + names(ev.ids) = cids + edt[, footprint := lapply(ev.id[cid], function(x){nodes2footprint(gg$nodes[seismic == x])})] + edt$type = 'seismic' + # change the cluster ID column to "strong" to emphesize that this is a strongly connected cluster + setnames(edt, 'cid', 'strong') + gg$set(seismic = edt) + } + } + + if (mark){ + gg$edges[!is.na(seismic)]$mark(col = mark.col) + gg$nodes[!is.na(seismic)]$mark(col = mark.col) + } + + return(gg) +} diff --git a/tests/testthat/test_gGnome_event_callers.R b/tests/testthat/test_gGnome_event_callers.R index 807d40b1..2d7a0afc 100644 --- a/tests/testthat/test_gGnome_event_callers.R +++ b/tests/testthat/test_gGnome_event_callers.R @@ -5,11 +5,11 @@ library(gTrack) gg.jabba = gG(jabba = system.file('extdata/hcc1954', 'jabba.rds', package="gGnome")) test_that('event caller', { - ## run events caller with qrp -############# not running this currently since it is erroring out (See: https://app.travis-ci.com/github/mskilab/gGnome/builds/237955222 ) - gg.events = events(gg.jabba, QRP = TRUE) + ## run events caller with qrp and with seismic + gg.events = events(gg.jabba, QRP = TRUE, seismic = TRUE) expect_is(gg.events$meta$events, 'data.table') expect_true(any(grepl('qrp', gg.events$meta$events$type))) + expect_true(any(grepl('seismic', gg.events$meta$events$type))) ## run events caller without qrp gg.events.no.qrp = events(gg.jabba, QRP = FALSE) From 6ecbfcd62aa4472130ba219dfd62a39dbe43b175 Mon Sep 17 00:00:00 2001 From: ShaiberAlon Date: Fri, 7 Jan 2022 15:52:36 -0500 Subject: [PATCH 2/9] small fix for seismic footprint --- R/eventCallers.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/eventCallers.R b/R/eventCallers.R index 41dc6637..6b29cb49 100755 --- a/R/eventCallers.R +++ b/R/eventCallers.R @@ -3017,7 +3017,7 @@ seismic = function(gg, amp.thresh = 2, ploidy.thresh = 2, min.internal = 14, } names(ev.ids) = cids - edt[, footprint := lapply(ev.id[cid], function(x){nodes2footprint(gg$nodes[seismic == x])})] + edt[, footprint := sapply(ev.id[cid], function(x){nodes2footprint(gg$nodes[seismic == x])})] edt$type = 'seismic' # change the cluster ID column to "strong" to emphesize that this is a strongly connected cluster setnames(edt, 'cid', 'strong') From 4336149274fe9d5033ee185f08187c09f32d2e45 Mon Sep 17 00:00:00 2001 From: ShaiberAlon Date: Sat, 8 Jan 2022 20:54:16 -0500 Subject: [PATCH 3/9] fix typo for cases with more than one seismic --- R/eventCallers.R | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/R/eventCallers.R b/R/eventCallers.R index 06f73178..b1e6fbc8 100755 --- a/R/eventCallers.R +++ b/R/eventCallers.R @@ -3017,7 +3017,7 @@ seismic = function(gg, amp.thresh = 2, ploidy.thresh = 2, min.internal = 14, } names(ev.ids) = cids - edt[, footprint := sapply(ev.id[cid], function(x){nodes2footprint(gg$nodes[seismic == x])})] + edt[, footprint := sapply(ev.ids[cid], function(x){nodes2footprint(gg$nodes[seismic == x])})] edt$type = 'seismic' # change the cluster ID column to "strong" to emphesize that this is a strongly connected cluster setnames(edt, 'cid', 'strong') @@ -3051,8 +3051,19 @@ events.to.gr = function(gg){ warning('No events annotated in this gGraph') return(GRanges()) } - ggrl = parse.grl(gg$meta$events$footprint) - mcols(ggrl) = gg$meta$events + return(events.dt.to.gr(gg$meta$events)) +} + +#' @name events.dt.to.gr +#' @title convert events annotated data.table to a GRanges +#' +#' @author Alon Shaiber +#' @param ev (data.table) events annotation data.table (gGraph$meta$events) +#' @return GRanges containing ranges of annotated events along with all metadata from gg$meta$events +#' @export +events.dt.to.gr = function(ev){ + ggrl = parse.grl(ev$footprint) + mcols(ggrl) = ev ggr = grl.unlist(ggrl) return(ggr) } From 19e8adfdf5e89e57e1c70351c7fcd8ade1c61b69 Mon Sep 17 00:00:00 2001 From: ShaiberAlon Date: Sat, 8 Jan 2022 21:42:53 -0500 Subject: [PATCH 4/9] make sure indexes are charachters to avoid issues --- R/eventCallers.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/eventCallers.R b/R/eventCallers.R index b1e6fbc8..14182108 100755 --- a/R/eventCallers.R +++ b/R/eventCallers.R @@ -3016,8 +3016,8 @@ seismic = function(gg, amp.thresh = 2, ploidy.thresh = 2, min.internal = 14, gg$nodes[cluster == cid]$mark(seismic = ev.id) } - names(ev.ids) = cids - edt[, footprint := sapply(ev.ids[cid], function(x){nodes2footprint(gg$nodes[seismic == x])})] + names(ev.ids) = as.character(cids) + edt[, footprint := sapply(ev.ids[as.character(cid)], function(x){nodes2footprint(gg$nodes[seismic == x])})] edt$type = 'seismic' # change the cluster ID column to "strong" to emphesize that this is a strongly connected cluster setnames(edt, 'cid', 'strong') From 5dfb7f26582cdba0df349da11faffc1cfef833b3 Mon Sep 17 00:00:00 2001 From: ShaiberAlon Date: Wed, 12 Jan 2022 16:38:35 -0500 Subject: [PATCH 5/9] also accept bedpe.gz for jJ --- R/converters.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/converters.R b/R/converters.R index b112eb58..61d58a92 100755 --- a/R/converters.R +++ b/R/converters.R @@ -909,7 +909,7 @@ read.juncs = function(rafile, ra = readRDS(rafile) ## validity check written for "junctions" class return(Junction$new(ra)) - } else if (grepl('(.bedpe$)', rafile)){ + } else if (grepl('bedpe(\\.gz)?$', rafile)){ ra.path = rafile cols = c('chr1', 'start1', 'end1', 'chr2', 'start2', 'end2', 'name', 'score', 'str1', 'str2') From 972550c1877372f1a714b042a07fbac721ee5258 Mon Sep 17 00:00:00 2001 From: ShaiberAlon Date: Wed, 12 Jan 2022 16:39:41 -0500 Subject: [PATCH 6/9] made some fixes for seismic() and also implemented a wrapper for the Rosswog et al. algorithm for detection of seismic amps --- R/eventCallers.R | 193 ++++++++++++++------- tests/testthat/test_gGnome_event_callers.R | 8 + 2 files changed, 137 insertions(+), 64 deletions(-) diff --git a/R/eventCallers.R b/R/eventCallers.R index 14182108..118fd566 100755 --- a/R/eventCallers.R +++ b/R/eventCallers.R @@ -2941,87 +2941,114 @@ qrp = function(gg, thresh = 1e6, max.small = 1e5, #' @param mc.cores (numeric) number of cores to use (1) #' @param mark (logical) nodes and edges with color if they are included in a seismic amplification (TRUE) #' @param mark.col (character) color to use (if mark set to TRUE) to mark edges and nodes (purple). +#' @param Rosswog (logical) run the original algorithm by Rosswog et al. +#' @param ... additional parameters for the Rosswog et al. algorithm (see help menu for gGnome::seismic_rosswog()). This is only relevant is Rosswog = TRUE. #' #' @return gg #' @export seismic = function(gg, amp.thresh = 2, ploidy.thresh = 2, min.internal = 14, - mc.cores = 1, mark = TRUE, mark.col = 'purple') + mc.cores = 1, mark = TRUE, mark.col = 'purple', Rosswog = FALSE, + ...) { - + # ad-hoc helper function to use to get the footprint + nodes2footprint = function(n){ + footprint = gr.stripstrand(n$footprint) + return(paste(gr.string(footprint), collapse = ';')) + } gg$nodes$mark(seismic = as.integer(NA)) gg$edges$mark(seismic = as.integer(NA)) gg$set(seismic = data.table()) - ploidy = gg$nodes$dt[!is.na(cn), sum(cn*as.numeric(width))/sum(as.numeric(width))] - - # following Rosswog et al. we use 5 as the threshold for samples with ploidy up to 2 - # and 9 as the threshold for samples with ploidy above 3 - amp.thresh = ifelse(ploidy <= ploidy.thresh, 2 * amp.thresh + 1, 4 * amp.thresh + 1) - - amplified = (gg$nodes$dt$cn) >= amp.thresh - - gg$clusters(amplified, mode = 'strong') - - cids = unique(gg$nodes$dt$cluster) - cids = cids[!is.na(cids)] + if (is.null(gg$meta$ploidy)){ + ploidy = gg$nodes$dt[!is.na(cn), sum(cn*as.numeric(width))/sum(as.numeric(width))] + } else { + ploidy = gg$meta$ploidy + } - edt = mclapply(cids, function(cid){ - sg = gg[cluster == cid] - if (length(sg$edges[type == 'ALT']) < 2){ - return(NULL) + if (isTRUE(Rosswog)){ + message('Running the original algorithm by Rosswog et al.') + rosswog_calls = seismic_rosswog(gg, ploidy, ...) + # mark edges and nodes + if (length(rosswog_calls$amplicons) > 0){ + gg$edges[rosswog_calls$svs$edge.id]$mark(seismic = rosswog_calls$svs$amplicon_id) + sgr = (gg$nodes$gr %*% rosswog_calls$amplicons[,'id']) + gg$nodes[sgr$node.id]$mark(seismic = sgr$id) + cols = c('id', 'nSegments', 'medianCN', 'maxCN', 'size_amplicon', 'nChrs_amplicon', 'nRegions_amplicon', + 'medianCN_amplicon', 'cnSpan_amplicon', 'cnStates_amplicon', 'nSegments_amplicon', + 'nSVs_amplicon', 'nSVsInternal_amplicon') + edt = gr2dt(rosswog_calls$amplicons %Q% (!duplicated(id)))[, ..cols] + setnames(edt, 'id', 'cid') # avoid using "id" field + edt[, footprint := sapply(cid, function(x){nodes2footprint(gg$nodes[seismic == x])})] + edt[, ev.id := .I] + edt$type = 'seismic' + # change the cluster ID column to "strong" to emphesize that this is a strongly connected cluster + gg$set(seismic = edt) } + } else { - sg.no.alt = sg$clone() - # mark regions with rid (region ID) - eids = sg.no.alt$edges$dt$type == 'REF' - sg.no.alt$clusters(j = eids) - sg$nodes$mark(rid = sg.no.alt$nodes$dt$cluster) - - # for each ALT edge check if both left and right node are in the same region (i.e. it is internal) - internal = sg$edges[type == 'ALT']$left$dt$rid == sg$edges[type == 'ALT']$right$dt$rid - - # if it is not internal then check if it falls on the edge of regions - ndt = sg$nodes$dt - left.ids = ndt[, min(node.id), by = rid]$V1 - right.ids = ndt[, max(node.id), by = rid]$V1 - edge.nodes = c(left.ids, right.ids) + # following Rosswog et al. we use 5 as the threshold for samples with ploidy up to 2 + # and 9 as the threshold for samples with ploidy above 3 + amp.thresh = ifelse(ploidy <= ploidy.thresh, 2 * amp.thresh + 1, 4 * amp.thresh + 1) - flanking = (sg$edges[type == 'ALT']$left$dt$node.id %in% edge.nodes) & (sg$edges[type == 'ALT']$right$dt$node.id %in% edge.nodes) + amplified = (gg$nodes$dt$cn) >= amp.thresh - other = !internal & !flanking + gg$clusters(amplified, mode = 'strong') - return(data.table(cid = cid, internal.junc = sum(internal), flanking.junc = sum(flanking), other.junc = sum(other))) - }, mc.cores = mc.cores) - - edt = rbindlist(edt) + cids = unique(gg$nodes$dt$cluster) + cids = cids[!is.na(cids)] - # ad-hoc helper function to use to get the footprint - nodes2footprint = function(n){ - footprint = gr.stripstrand(n$footprint) - return(paste(gr.string(footprint), collapse = ';')) - } - - if (edt[,.N] > 0){ - edt = edt[internal.junc >= min.internal] - cids = edt$cid - ev.ids = seq_along(cids) - if (length(cids) > 0){ - for (ev.id in ev.ids){ - # annotate all edges that have both breakpoints whithin the cluster - cid = cids[ev.id] - e.idx = which(gg$edges[type == 'ALT']$left$dt$cluster == cid & gg$edges[type == 'ALT']$right$dt$cluster == cid) - eids = gg$edges[type == 'ALT'][e.idx]$dt$edge.id - gg$edges[eids]$mark(seismic = ev.id) - - # annotate nodes - gg$nodes[cluster == cid]$mark(seismic = ev.id) + edt = mclapply(cids, function(cid){ + sg = gg[cluster == cid] + if (length(sg$edges[type == 'ALT']) < 2){ + return(NULL) } - names(ev.ids) = as.character(cids) - edt[, footprint := sapply(ev.ids[as.character(cid)], function(x){nodes2footprint(gg$nodes[seismic == x])})] - edt$type = 'seismic' - # change the cluster ID column to "strong" to emphesize that this is a strongly connected cluster - setnames(edt, 'cid', 'strong') - gg$set(seismic = edt) + sg.no.alt = sg$clone() + # mark regions with rid (region ID) + eids = sg.no.alt$edges$dt$type == 'REF' + sg.no.alt$clusters(j = eids) + sg$nodes$mark(rid = sg.no.alt$nodes$dt$cluster) + + # for each ALT edge check if both left and right node are in the same region (i.e. it is internal) + internal = sg$edges[type == 'ALT']$left$dt$rid == sg$edges[type == 'ALT']$right$dt$rid + + # if it is not internal then check if it falls on the edge of regions + ndt = sg$nodes$dt + left.ids = ndt[, min(node.id), by = rid]$V1 + right.ids = ndt[, max(node.id), by = rid]$V1 + edge.nodes = c(left.ids, right.ids) + + flanking = (sg$edges[type == 'ALT']$left$dt$node.id %in% edge.nodes) & (sg$edges[type == 'ALT']$right$dt$node.id %in% edge.nodes) + + other = !internal & !flanking + + return(data.table(cid = cid, internal.junc = sum(internal), flanking.junc = sum(flanking), other.junc = sum(other))) + }, mc.cores = mc.cores) + + edt = rbindlist(edt) + + if (edt[,.N] > 0){ + edt = edt[internal.junc >= min.internal] + cids = edt$cid + ev.ids = seq_along(cids) + if (length(cids) > 0){ + for (ev.id in ev.ids){ + # annotate all edges that have both breakpoints whithin the cluster + cid = cids[ev.id] + e.idx = which(gg$edges[type == 'ALT']$left$dt$cluster == cid & gg$edges[type == 'ALT']$right$dt$cluster == cid) + eids = gg$edges[type == 'ALT'][e.idx]$dt$edge.id + gg$edges[eids]$mark(seismic = ev.id) + + # annotate nodes + gg$nodes[cluster == cid]$mark(seismic = ev.id) + } + + names(ev.ids) = as.character(cids) + edt[, footprint := sapply(ev.ids[as.character(cid)], function(x){nodes2footprint(gg$nodes[seismic == x])})] + edt$type = 'seismic' + # change the cluster ID column to "strong" to emphesize that this is a strongly connected cluster + setnames(edt, 'cid', 'strong') + gg$set(seismic = edt) + } } } @@ -3067,3 +3094,41 @@ events.dt.to.gr = function(ev){ ggr = grl.unlist(ggrl) return(ggr) } + +seismic_rosswog = function(gg, ploidy = 2, rosswog_dir = NULL, chrBands = NULL, minInternalSVs = 14, cnvTol = 5e3){ + if (!is.null(rosswog_dir)){ + if (dir.exists(rosswog_dir)){ + rosswog_scripts = paste0(rosswog_dir, '/seismic_amplification_detection/seismic_amplification_detection.R') + if (file.exists(rosswog_scripts)){ + source(rosswog_scripts) + breaks = gg$nodes$gr + if (!('cn' %in% names(mcols(gg$nodes$gr)))){ + warning('In order to call seismic amplifications gGraph nodes must contain "cn" values') + return(list(amplicons = GRanges(), svs = data.table())) + } + jdt = gg$junctions[type == 'ALT']$dt[,-c('bp1', 'bp2')] + if (jdt[,.N] == 0){ + message('No junctions found in the graph so no seismic amplification') + return(list(amplicons = GRanges(), svs = data.table())) + } + setnames(jdt, c('start1', 'start2'), + c('bp1', 'bp2')) + if (is.null(chrBands)){ + message('No chrBands provided so using the default hg19 (with no chr prefix)') + chrBands = fread(paste0(rosswog_dir, '/seismic_amplification_detection/chromosome_bands_hg19.txt')) + chrBands = chrBands[, `:=`(seqnames = gsub('chr', '', chrom), + start = chromStart, + end = chromEnd)] + chrBands = chrBands[seqnames %in% seqlevels(breaks)] + bands = trim(gUtils::dt2gr(chrBands, + seqlengths = seqlengths(breaks)[intersect(chrBands$seqnames, seqlevels(breaks))])) + } + rosswog_calls = detect_seismic_amplification(cnv=breaks, sv=jdt, chrBands=bands, + ploidy = ploidy, + minInternalSVs = minInternalSVs, + cnvTol = cnvTol) + return(rosswog_calls) + } + } + } +} diff --git a/tests/testthat/test_gGnome_event_callers.R b/tests/testthat/test_gGnome_event_callers.R index 5f7334de..51e021c4 100644 --- a/tests/testthat/test_gGnome_event_callers.R +++ b/tests/testthat/test_gGnome_event_callers.R @@ -109,3 +109,11 @@ test_that('microhomology', { expect_error(microhomology(gg, fa)) }) + +test_that('seismic', { + # test the Rosswog et al. algorithm version + # clone the repo + seismic_dir = paste0(tempdir(), '/', 'SeismicAmplification') + system(paste0('git clone https://github.com/ShaiberAlon/SeismicAmplification.git ', seismic_dir)) + s = seismic(gg.jabba, Rosswog = TRUE, rosswog_dir = seismic_dir) +}) From 7601e99b4c7622cd884b5aa9ee84ce5cc4557044 Mon Sep 17 00:00:00 2001 From: ShaiberAlon Date: Wed, 12 Jan 2022 21:50:39 -0500 Subject: [PATCH 7/9] fix the way we construct the sv data.table for input to seismic amp detection --- R/eventCallers.R | 22 ++++++++++++++++------ tests/testthat/test_gGnome_event_callers.R | 1 + 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/R/eventCallers.R b/R/eventCallers.R index 118fd566..e7eb7011 100755 --- a/R/eventCallers.R +++ b/R/eventCallers.R @@ -2958,6 +2958,9 @@ seismic = function(gg, amp.thresh = 2, ploidy.thresh = 2, min.internal = 14, gg$nodes$mark(seismic = as.integer(NA)) gg$edges$mark(seismic = as.integer(NA)) gg$set(seismic = data.table()) + if (!('cn' %in% names(mcols(gg$nodes$gr)))){ + stop('In order to call seismic amplifications gGraph nodes must contain "cn" values') + } if (is.null(gg$meta$ploidy)){ ploidy = gg$nodes$dt[!is.na(cn), sum(cn*as.numeric(width))/sum(as.numeric(width))] } else { @@ -3103,16 +3106,23 @@ seismic_rosswog = function(gg, ploidy = 2, rosswog_dir = NULL, chrBands = NULL, source(rosswog_scripts) breaks = gg$nodes$gr if (!('cn' %in% names(mcols(gg$nodes$gr)))){ - warning('In order to call seismic amplifications gGraph nodes must contain "cn" values') - return(list(amplicons = GRanges(), svs = data.table())) + stop('In order to call seismic amplifications gGraph nodes must contain "cn" values') } - jdt = gg$junctions[type == 'ALT']$dt[,-c('bp1', 'bp2')] - if (jdt[,.N] == 0){ + alts = gg$junctions[type == 'ALT'] + if (length(alts) == 0){ message('No junctions found in the graph so no seismic amplification') return(list(amplicons = GRanges(), svs = data.table())) } - setnames(jdt, c('start1', 'start2'), - c('bp1', 'bp2')) + jgrl = alts$grl + chrs = as.vector(unlist(seqnames(jgrl))) + starts = unlist(start(jgrl)) + odd = seq(1,2*length(alts),2) + even = seq(2,2*length(alts),2) + jdt = data.table(chr1 = chrs[odd], + chr2 = chrs[even], + bp1 = starts[odd], + bp2 = starts[even], + edge.id = alts$dt$edge.id) if (is.null(chrBands)){ message('No chrBands provided so using the default hg19 (with no chr prefix)') chrBands = fread(paste0(rosswog_dir, '/seismic_amplification_detection/chromosome_bands_hg19.txt')) diff --git a/tests/testthat/test_gGnome_event_callers.R b/tests/testthat/test_gGnome_event_callers.R index 51e021c4..248c1451 100644 --- a/tests/testthat/test_gGnome_event_callers.R +++ b/tests/testthat/test_gGnome_event_callers.R @@ -116,4 +116,5 @@ test_that('seismic', { seismic_dir = paste0(tempdir(), '/', 'SeismicAmplification') system(paste0('git clone https://github.com/ShaiberAlon/SeismicAmplification.git ', seismic_dir)) s = seismic(gg.jabba, Rosswog = TRUE, rosswog_dir = seismic_dir) + expect_error(seismic(gG())) }) From 5c47cb97425cf4ed4cec2a4b12e6a2cc071172bc Mon Sep 17 00:00:00 2001 From: ShaiberAlon Date: Tue, 18 Jan 2022 15:39:32 -0500 Subject: [PATCH 8/9] reorganize seismic related functions --- R/eventCallers.R | 83 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 61 insertions(+), 22 deletions(-) diff --git a/R/eventCallers.R b/R/eventCallers.R index e7eb7011..a264f3d6 100755 --- a/R/eventCallers.R +++ b/R/eventCallers.R @@ -2950,11 +2950,6 @@ seismic = function(gg, amp.thresh = 2, ploidy.thresh = 2, min.internal = 14, mc.cores = 1, mark = TRUE, mark.col = 'purple', Rosswog = FALSE, ...) { - # ad-hoc helper function to use to get the footprint - nodes2footprint = function(n){ - footprint = gr.stripstrand(n$footprint) - return(paste(gr.string(footprint), collapse = ';')) - } gg$nodes$mark(seismic = as.integer(NA)) gg$edges$mark(seismic = as.integer(NA)) gg$set(seismic = data.table()) @@ -2970,22 +2965,7 @@ seismic = function(gg, amp.thresh = 2, ploidy.thresh = 2, min.internal = 14, if (isTRUE(Rosswog)){ message('Running the original algorithm by Rosswog et al.') rosswog_calls = seismic_rosswog(gg, ploidy, ...) - # mark edges and nodes - if (length(rosswog_calls$amplicons) > 0){ - gg$edges[rosswog_calls$svs$edge.id]$mark(seismic = rosswog_calls$svs$amplicon_id) - sgr = (gg$nodes$gr %*% rosswog_calls$amplicons[,'id']) - gg$nodes[sgr$node.id]$mark(seismic = sgr$id) - cols = c('id', 'nSegments', 'medianCN', 'maxCN', 'size_amplicon', 'nChrs_amplicon', 'nRegions_amplicon', - 'medianCN_amplicon', 'cnSpan_amplicon', 'cnStates_amplicon', 'nSegments_amplicon', - 'nSVs_amplicon', 'nSVsInternal_amplicon') - edt = gr2dt(rosswog_calls$amplicons %Q% (!duplicated(id)))[, ..cols] - setnames(edt, 'id', 'cid') # avoid using "id" field - edt[, footprint := sapply(cid, function(x){nodes2footprint(gg$nodes[seismic == x])})] - edt[, ev.id := .I] - edt$type = 'seismic' - # change the cluster ID column to "strong" to emphesize that this is a strongly connected cluster - gg$set(seismic = edt) - } + gg = annotate_rosswog_calls(gg, rosswog_calls) } else { # following Rosswog et al. we use 5 as the threshold for samples with ploidy up to 2 @@ -3098,7 +3078,22 @@ events.dt.to.gr = function(ev){ return(ggr) } -seismic_rosswog = function(gg, ploidy = 2, rosswog_dir = NULL, chrBands = NULL, minInternalSVs = 14, cnvTol = 5e3){ + +#' @name seismic_rosswog +#' @description +#' Identification of seismic amplicons using the algorithm provided by Rosswog et al. 2021 +#' +#' @param gg gGraph +#' @param ploidy ploidy value +#' @param rosswog_dir path to a local clone of the github repository provided by Rosswog et al. (https://github.com/seismicon/SeismicAmplification) +#' @param chrBands GRanges object (see https://github.com/seismicon/SeismicAmplification for details). If nothing is provided then the hg19 version (with no chr prefix) is used. +#' @param minInternalSVs (numeric) see https://github.com/seismicon/SeismicAmplification (14 by default) +#' @param cnvTol (numeric) see https://github.com/seismicon/SeismicAmplification (5e3 by default) +#' +#' @return list(amplicons = GRanges, svs = data.table) +#' @export +seismic_rosswog = function(gg, ploidy = 2, rosswog_dir = NULL, chrBands = NULL, + minInternalSVs = 14, cnvTol = 5e3){ if (!is.null(rosswog_dir)){ if (dir.exists(rosswog_dir)){ rosswog_scripts = paste0(rosswog_dir, '/seismic_amplification_detection/seismic_amplification_detection.R') @@ -3142,3 +3137,47 @@ seismic_rosswog = function(gg, ploidy = 2, rosswog_dir = NULL, chrBands = NULL, } } } + + +#' @name annotate_rosswog_calls +#' @description +#' Takes the output of the Rosswog et al. seismic amplification detection algorithm +#' and annotates the gGraph accordingly +#' +#' @param gg gGraph +#' @param rosswog_calls the output of seismic_rosswog() +#' @return gg +annotate_rosswog_calls = function(gg, rosswog_calls){ + # mark edges and nodes + if (length(rosswog_calls$amplicons) > 0){ + gg$edges[rosswog_calls$svs$edge.id]$mark(seismic = rosswog_calls$svs$amplicon_id) + sgr = (gg$nodes$gr %*% rosswog_calls$amplicons[,'id']) + gg$nodes[sgr$node.id]$mark(seismic = sgr$id) + cols = c('id', 'nSegments', 'medianCN', 'maxCN', 'size_amplicon', 'nChrs_amplicon', 'nRegions_amplicon', + 'medianCN_amplicon', 'cnSpan_amplicon', 'cnStates_amplicon', 'nSegments_amplicon', + 'nSVs_amplicon', 'nSVsInternal_amplicon') + edt = gr2dt(rosswog_calls$amplicons %Q% (!duplicated(id)))[, ..cols] + setnames(edt, 'id', 'cid') # avoid using "id" field + edt[, footprint := sapply(cid, function(x){nodes2footprint(gg$nodes[seismic == x])})] + edt[, ev.id := .I] + edt$type = 'seismic' + # change the cluster ID column to "strong" to emphesize that this is a strongly connected cluster + gg$set(seismic = edt) + } + return(gg) +} + +#' @name nodes2footprint.str +#' @description +#' Takes a gNode and returns character footprint parsable by parse.gr +#' +#' @param n gNode +#' @param stripstrand (logical) string the strand information (TRUE) +#' @return character +nodes2footprint.str = function(n, stripstrand = TRUE){ + footprint = n$footprint + if (stripstrand){ + footprint = gr.stripstrand(footprint) + } + return(paste(gr.string(footprint), collapse = ';')) +} From 642a612d2a08762113fd3215934c7a533576f3ae Mon Sep 17 00:00:00 2001 From: ShaiberAlon Date: Thu, 20 Jan 2022 14:28:44 -0500 Subject: [PATCH 9/9] annotating seismic.rosswog for Rosswog et al. calls and also allowing events to annotate both graph-based calls and Rosswog et al. calls --- R/eventCallers.R | 50 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/R/eventCallers.R b/R/eventCallers.R index a264f3d6..889a8e23 100755 --- a/R/eventCallers.R +++ b/R/eventCallers.R @@ -1051,6 +1051,10 @@ annotate_walks = function(walks) #' Shortcut to call all simple and complex event types on JaBbA graph using standard settings on all event callers. #' #' @param gg gGraph +#' @param verbose +#' @param mark +#' @param QRP annotate quasi reciprocal events +#' @param seismic (logical or character) if TRUE then annotates seismic amplifications using a graph-based approach, if 'Rosswog' then the Rosswog et al. algorithm is used and events are annotated as seismic.rosswog, and if 'both' then will annotate both "seismic" (graph-based) and "seismic.rosswog" (Rosswog et al. algorithm) #' @return gGraph with nodes and edges annotated with complex events in their node and edge metadata and in the graph meta data field $events #' @export events = function(gg, verbose = TRUE, mark = FALSE, QRP = FALSE, seismic = FALSE) @@ -1089,10 +1093,17 @@ events = function(gg, verbose = TRUE, mark = FALSE, QRP = FALSE, seismic = FALSE message('Finished qrp') } - if (seismic){ - gg = gg %>% seismic(mark = TRUE) + if (seismic == TRUE | seismic == 'both' | seismic == 'Rosswog'){ + Rosswog = FALSE + if (seismic == 'Rosswog'){ + Rosswog = TRUE + } + gg = gg %>% seismic(mark = TRUE, Rosswog = Rosswog) if (verbose) message('Finished seismic') + if (seismic == 'both'){ + gg = gg %>% seismic(mark = TRUE, Rosswog = TRUE) + } } ev = rbind( @@ -1114,11 +1125,16 @@ events = function(gg, verbose = TRUE, mark = FALSE, QRP = FALSE, seismic = FALSE gg$meta$qrpmin, gg$meta$qrpmix, fill = TRUE)[, ev.id := seq_len(.N)] } - if (seismic){ + if (seismic == TRUE | seismic == 'both'){ ev = rbind( ev, gg$meta$seismic, fill = TRUE)[, ev.id := seq_len(.N)] } + if (seismic == 'Rosswog'){ + ev = rbind( + ev, + gg$meta$seismic.rosswog, fill = TRUE)[, ev.id := seq_len(.N)] + } gg$set(events = ev) return(gg) @@ -3026,7 +3042,7 @@ seismic = function(gg, amp.thresh = 2, ploidy.thresh = 2, min.internal = 14, } names(ev.ids) = as.character(cids) - edt[, footprint := sapply(ev.ids[as.character(cid)], function(x){nodes2footprint(gg$nodes[seismic == x])})] + edt[, footprint := sapply(ev.ids[as.character(cid)], function(x){nodes2footprint.str(gg$nodes[seismic == x])})] edt$type = 'seismic' # change the cluster ID column to "strong" to emphesize that this is a strongly connected cluster setnames(edt, 'cid', 'strong') @@ -3146,23 +3162,35 @@ seismic_rosswog = function(gg, ploidy = 2, rosswog_dir = NULL, chrBands = NULL, #' #' @param gg gGraph #' @param rosswog_calls the output of seismic_rosswog() +#' @param mark.as.rosswog (logical) if set to TRUE then seismic events are annotated as "seismic.rosswog" (TRUE), otherwise annotated as "seismic". This is meant to allow the user to annotate both graph-based "seismic" events and Rosswog et al. "seismic.rosswog" events #' @return gg -annotate_rosswog_calls = function(gg, rosswog_calls){ +annotate_rosswog_calls = function(gg, rosswog_calls, mark.as.rosswog = TRUE){ # mark edges and nodes if (length(rosswog_calls$amplicons) > 0){ - gg$edges[rosswog_calls$svs$edge.id]$mark(seismic = rosswog_calls$svs$amplicon_id) - sgr = (gg$nodes$gr %*% rosswog_calls$amplicons[,'id']) - gg$nodes[sgr$node.id]$mark(seismic = sgr$id) + if (isTRUE(mark.as.rosswog)){ + gg$edges[rosswog_calls$svs$edge.id]$mark(seismic.rosswog = rosswog_calls$svs$amplicon_id) + sgr = (gg$nodes$gr %*% rosswog_calls$amplicons[,'id']) + gg$nodes[sgr$node.id]$mark(seismic.rosswog = sgr$id) + } else { + gg$edges[rosswog_calls$svs$edge.id]$mark(seismic = rosswog_calls$svs$amplicon_id) + sgr = (gg$nodes$gr %*% rosswog_calls$amplicons[,'id']) + gg$nodes[sgr$node.id]$mark(seismic = sgr$id) + } cols = c('id', 'nSegments', 'medianCN', 'maxCN', 'size_amplicon', 'nChrs_amplicon', 'nRegions_amplicon', 'medianCN_amplicon', 'cnSpan_amplicon', 'cnStates_amplicon', 'nSegments_amplicon', 'nSVs_amplicon', 'nSVsInternal_amplicon') edt = gr2dt(rosswog_calls$amplicons %Q% (!duplicated(id)))[, ..cols] setnames(edt, 'id', 'cid') # avoid using "id" field - edt[, footprint := sapply(cid, function(x){nodes2footprint(gg$nodes[seismic == x])})] + name = ifelse(isTRUE(mark.as.rosswog), 'seismic.rosswog', 'seismic') + edt[, footprint := sapply(cid, function(x){nodes2footprint.str(gg$nodes[get(name) == x])})] edt[, ev.id := .I] - edt$type = 'seismic' + edt$type = name # change the cluster ID column to "strong" to emphesize that this is a strongly connected cluster - gg$set(seismic = edt) + if (isTRUE(mark.as.rosswog)){ + gg$set(seismic.rosswog = edt) + } else { + gg$set(seismic = edt) + } } return(gg) }