Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Pruning] Seismic #124

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
2 changes: 1 addition & 1 deletion R/converters.R
Original file line number Diff line number Diff line change
Expand Up @@ -909,7 +909,7 @@ read.juncs = function(rafile,
ra = readRDS(rafile)
## validity check written for "junctions" class
return(Junction$new(ra))
} else if (grepl('(.bedpe$)', rafile)){
} else if (grepl('bedpe(\\.gz)?$', rafile)){
ra.path = rafile
cols = c('chr1', 'start1', 'end1', 'chr2', 'start2', 'end2', 'name', 'score', 'str1', 'str2')

Expand Down
277 changes: 273 additions & 4 deletions R/eventCallers.R
Original file line number Diff line number Diff line change
Expand Up @@ -1051,9 +1051,13 @@ annotate_walks = function(walks)
#' Shortcut to call all simple and complex event types on JaBbA graph using standard settings on all event callers.
#'
#' @param gg gGraph
#' @param verbose
#' @param mark
#' @param QRP annotate quasi reciprocal events
#' @param seismic (logical or character) if TRUE then annotates seismic amplifications using a graph-based approach, if 'Rosswog' then the Rosswog et al. algorithm is used and events are annotated as seismic.rosswog, and if 'both' then will annotate both "seismic" (graph-based) and "seismic.rosswog" (Rosswog et al. algorithm)
#' @return gGraph with nodes and edges annotated with complex events in their node and edge metadata and in the graph meta data field $events
#' @export
events = function(gg, verbose = TRUE, mark = FALSE, QRP = FALSE)
events = function(gg, verbose = TRUE, mark = FALSE, QRP = FALSE, seismic = FALSE)
{
gg = gg %>% simple(mark = TRUE)
if (verbose)
Expand Down Expand Up @@ -1088,7 +1092,20 @@ events = function(gg, verbose = TRUE, mark = FALSE, QRP = FALSE)
if (verbose)
message('Finished qrp')
}


if (seismic == TRUE | seismic == 'both' | seismic == 'Rosswog'){
Rosswog = FALSE
if (seismic == 'Rosswog'){
Rosswog = TRUE
}
gg = gg %>% seismic(mark = TRUE, Rosswog = Rosswog)
if (verbose)
message('Finished seismic')
if (seismic == 'both'){
gg = gg %>% seismic(mark = TRUE, Rosswog = TRUE)
}
}

ev = rbind(
gg$meta$simple,
gg$meta$chromothripsis,
Expand All @@ -1108,6 +1125,16 @@ events = function(gg, verbose = TRUE, mark = FALSE, QRP = FALSE)
gg$meta$qrpmin,
gg$meta$qrpmix, fill = TRUE)[, ev.id := seq_len(.N)]
}
if (seismic == TRUE | seismic == 'both'){
ev = rbind(
ev,
gg$meta$seismic, fill = TRUE)[, ev.id := seq_len(.N)]
}
if (seismic == 'Rosswog'){
ev = rbind(
ev,
gg$meta$seismic.rosswog, fill = TRUE)[, ev.id := seq_len(.N)]
}

gg$set(events = ev)
return(gg)
Expand Down Expand Up @@ -2916,6 +2943,121 @@ qrp = function(gg, thresh = 1e6, max.small = 1e5,
return(gg)

}


#' @name seismic
#' @description
#' Identification of seismic amplicons according to the definitions in Rosswog et al.
#' should be used.
#'
#' @param gg gGraph
#' @param amp.thresh (numeric) multiply of rounded ploidy to use for detection of amplification (2). The threshold for amplification is set to >= rounded.ploidy * amp.thresh + 1 where rounded.ploidy is either 2 or 4 (determined using ploidy.thresh).
#' @param ploidy.thresh (numeric) threshold to use above which ploidy is considered 4 (and up to it, 2). Rosswog et al. used 2 and hence this is the default value.
#' @param min.internal (numeric) minimal number of internal junctions in a seismic amplification (14).
#' @param mc.cores (numeric) number of cores to use (1)
#' @param mark (logical) nodes and edges with color if they are included in a seismic amplification (TRUE)
#' @param mark.col (character) color to use (if mark set to TRUE) to mark edges and nodes (purple).
#' @param Rosswog (logical) run the original algorithm by Rosswog et al.
#' @param ... additional parameters for the Rosswog et al. algorithm (see help menu for gGnome::seismic_rosswog()). This is only relevant is Rosswog = TRUE.
#'
#' @return gg
#' @export
seismic = function(gg, amp.thresh = 2, ploidy.thresh = 2, min.internal = 14,
mc.cores = 1, mark = TRUE, mark.col = 'purple', Rosswog = FALSE,
...)
{
gg$nodes$mark(seismic = as.integer(NA))
gg$edges$mark(seismic = as.integer(NA))
gg$set(seismic = data.table())
if (!('cn' %in% names(mcols(gg$nodes$gr)))){
stop('In order to call seismic amplifications gGraph nodes must contain "cn" values')
}
if (is.null(gg$meta$ploidy)){
ploidy = gg$nodes$dt[!is.na(cn), sum(cn*as.numeric(width))/sum(as.numeric(width))]
} else {
ploidy = gg$meta$ploidy
}

if (isTRUE(Rosswog)){
message('Running the original algorithm by Rosswog et al.')
rosswog_calls = seismic_rosswog(gg, ploidy, ...)
gg = annotate_rosswog_calls(gg, rosswog_calls)
} else {

# following Rosswog et al. we use 5 as the threshold for samples with ploidy up to 2
# and 9 as the threshold for samples with ploidy above 3
amp.thresh = ifelse(ploidy <= ploidy.thresh, 2 * amp.thresh + 1, 4 * amp.thresh + 1)

amplified = (gg$nodes$dt$cn) >= amp.thresh

gg$clusters(amplified, mode = 'strong')

cids = unique(gg$nodes$dt$cluster)
cids = cids[!is.na(cids)]

edt = mclapply(cids, function(cid){
sg = gg[cluster == cid]
if (length(sg$edges[type == 'ALT']) < 2){
return(NULL)
}

sg.no.alt = sg$clone()
# mark regions with rid (region ID)
eids = sg.no.alt$edges$dt$type == 'REF'
sg.no.alt$clusters(j = eids)
sg$nodes$mark(rid = sg.no.alt$nodes$dt$cluster)

# for each ALT edge check if both left and right node are in the same region (i.e. it is internal)
internal = sg$edges[type == 'ALT']$left$dt$rid == sg$edges[type == 'ALT']$right$dt$rid

# if it is not internal then check if it falls on the edge of regions
ndt = sg$nodes$dt
left.ids = ndt[, min(node.id), by = rid]$V1
right.ids = ndt[, max(node.id), by = rid]$V1
edge.nodes = c(left.ids, right.ids)

flanking = (sg$edges[type == 'ALT']$left$dt$node.id %in% edge.nodes) & (sg$edges[type == 'ALT']$right$dt$node.id %in% edge.nodes)

other = !internal & !flanking

return(data.table(cid = cid, internal.junc = sum(internal), flanking.junc = sum(flanking), other.junc = sum(other)))
}, mc.cores = mc.cores)

edt = rbindlist(edt)

if (edt[,.N] > 0){
edt = edt[internal.junc >= min.internal]
cids = edt$cid
ev.ids = seq_along(cids)
if (length(cids) > 0){
for (ev.id in ev.ids){
# annotate all edges that have both breakpoints whithin the cluster
cid = cids[ev.id]
e.idx = which(gg$edges[type == 'ALT']$left$dt$cluster == cid & gg$edges[type == 'ALT']$right$dt$cluster == cid)
eids = gg$edges[type == 'ALT'][e.idx]$dt$edge.id
gg$edges[eids]$mark(seismic = ev.id)

# annotate nodes
gg$nodes[cluster == cid]$mark(seismic = ev.id)
}

names(ev.ids) = as.character(cids)
edt[, footprint := sapply(ev.ids[as.character(cid)], function(x){nodes2footprint.str(gg$nodes[seismic == x])})]
edt$type = 'seismic'
# change the cluster ID column to "strong" to emphesize that this is a strongly connected cluster
setnames(edt, 'cid', 'strong')
gg$set(seismic = edt)
}
}
}

if (mark){
gg$edges[!is.na(seismic)]$mark(col = mark.col)
gg$nodes[!is.na(seismic)]$mark(col = mark.col)
}

return(gg)
}

#' @name events.to.gr
#' @title Extract event annotation as a GRanges
Expand All @@ -2935,8 +3077,135 @@ events.to.gr = function(gg){
warning('No events annotated in this gGraph')
return(GRanges())
}
ggrl = parse.grl(gg$meta$events$footprint)
mcols(ggrl) = gg$meta$events
return(events.dt.to.gr(gg$meta$events))
}

#' @name events.dt.to.gr
#' @title convert events annotated data.table to a GRanges
#'
#' @author Alon Shaiber
#' @param ev (data.table) events annotation data.table (gGraph$meta$events)
#' @return GRanges containing ranges of annotated events along with all metadata from gg$meta$events
#' @export
events.dt.to.gr = function(ev){
ggrl = parse.grl(ev$footprint)
mcols(ggrl) = ev
ggr = grl.unlist(ggrl)
return(ggr)
}


#' @name seismic_rosswog
#' @description
#' Identification of seismic amplicons using the algorithm provided by Rosswog et al. 2021
#'
#' @param gg gGraph
#' @param ploidy ploidy value
#' @param rosswog_dir path to a local clone of the github repository provided by Rosswog et al. (https://github.com/seismicon/SeismicAmplification)
#' @param chrBands GRanges object (see https://github.com/seismicon/SeismicAmplification for details). If nothing is provided then the hg19 version (with no chr prefix) is used.
#' @param minInternalSVs (numeric) see https://github.com/seismicon/SeismicAmplification (14 by default)
#' @param cnvTol (numeric) see https://github.com/seismicon/SeismicAmplification (5e3 by default)
#'
#' @return list(amplicons = GRanges, svs = data.table)
#' @export
seismic_rosswog = function(gg, ploidy = 2, rosswog_dir = NULL, chrBands = NULL,
minInternalSVs = 14, cnvTol = 5e3){
if (!is.null(rosswog_dir)){
if (dir.exists(rosswog_dir)){
rosswog_scripts = paste0(rosswog_dir, '/seismic_amplification_detection/seismic_amplification_detection.R')
if (file.exists(rosswog_scripts)){
source(rosswog_scripts)
breaks = gg$nodes$gr
if (!('cn' %in% names(mcols(gg$nodes$gr)))){
stop('In order to call seismic amplifications gGraph nodes must contain "cn" values')
}
alts = gg$junctions[type == 'ALT']
if (length(alts) == 0){
message('No junctions found in the graph so no seismic amplification')
return(list(amplicons = GRanges(), svs = data.table()))
}
jgrl = alts$grl
chrs = as.vector(unlist(seqnames(jgrl)))
starts = unlist(start(jgrl))
odd = seq(1,2*length(alts),2)
even = seq(2,2*length(alts),2)
jdt = data.table(chr1 = chrs[odd],
chr2 = chrs[even],
bp1 = starts[odd],
bp2 = starts[even],
edge.id = alts$dt$edge.id)
if (is.null(chrBands)){
message('No chrBands provided so using the default hg19 (with no chr prefix)')
chrBands = fread(paste0(rosswog_dir, '/seismic_amplification_detection/chromosome_bands_hg19.txt'))
chrBands = chrBands[, `:=`(seqnames = gsub('chr', '', chrom),
start = chromStart,
end = chromEnd)]
chrBands = chrBands[seqnames %in% seqlevels(breaks)]
bands = trim(gUtils::dt2gr(chrBands,
seqlengths = seqlengths(breaks)[intersect(chrBands$seqnames, seqlevels(breaks))]))
}
rosswog_calls = detect_seismic_amplification(cnv=breaks, sv=jdt, chrBands=bands,
ploidy = ploidy,
minInternalSVs = minInternalSVs,
cnvTol = cnvTol)
return(rosswog_calls)
}
}
}
}


#' @name annotate_rosswog_calls
#' @description
#' Takes the output of the Rosswog et al. seismic amplification detection algorithm
#' and annotates the gGraph accordingly
#'
#' @param gg gGraph
#' @param rosswog_calls the output of seismic_rosswog()
#' @param mark.as.rosswog (logical) if set to TRUE then seismic events are annotated as "seismic.rosswog" (TRUE), otherwise annotated as "seismic". This is meant to allow the user to annotate both graph-based "seismic" events and Rosswog et al. "seismic.rosswog" events
#' @return gg
annotate_rosswog_calls = function(gg, rosswog_calls, mark.as.rosswog = TRUE){
# mark edges and nodes
if (length(rosswog_calls$amplicons) > 0){
if (isTRUE(mark.as.rosswog)){
gg$edges[rosswog_calls$svs$edge.id]$mark(seismic.rosswog = rosswog_calls$svs$amplicon_id)
sgr = (gg$nodes$gr %*% rosswog_calls$amplicons[,'id'])
gg$nodes[sgr$node.id]$mark(seismic.rosswog = sgr$id)
} else {
gg$edges[rosswog_calls$svs$edge.id]$mark(seismic = rosswog_calls$svs$amplicon_id)
sgr = (gg$nodes$gr %*% rosswog_calls$amplicons[,'id'])
gg$nodes[sgr$node.id]$mark(seismic = sgr$id)
}
cols = c('id', 'nSegments', 'medianCN', 'maxCN', 'size_amplicon', 'nChrs_amplicon', 'nRegions_amplicon',
'medianCN_amplicon', 'cnSpan_amplicon', 'cnStates_amplicon', 'nSegments_amplicon',
'nSVs_amplicon', 'nSVsInternal_amplicon')
edt = gr2dt(rosswog_calls$amplicons %Q% (!duplicated(id)))[, ..cols]
setnames(edt, 'id', 'cid') # avoid using "id" field
name = ifelse(isTRUE(mark.as.rosswog), 'seismic.rosswog', 'seismic')
edt[, footprint := sapply(cid, function(x){nodes2footprint.str(gg$nodes[get(name) == x])})]
edt[, ev.id := .I]
edt$type = name
# change the cluster ID column to "strong" to emphesize that this is a strongly connected cluster
if (isTRUE(mark.as.rosswog)){
gg$set(seismic.rosswog = edt)
} else {
gg$set(seismic = edt)
}
}
return(gg)
}

#' @name nodes2footprint.str
#' @description
#' Takes a gNode and returns character footprint parsable by parse.gr
#'
#' @param n gNode
#' @param stripstrand (logical) string the strand information (TRUE)
#' @return character
nodes2footprint.str = function(n, stripstrand = TRUE){
footprint = n$footprint
if (stripstrand){
footprint = gr.stripstrand(footprint)
}
return(paste(gr.string(footprint), collapse = ';'))
}
15 changes: 12 additions & 3 deletions tests/testthat/test_gGnome_event_callers.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@ library(gTrack)
gg.jabba = gG(jabba = system.file('extdata/hcc1954', 'jabba.rds', package="gGnome"))

test_that('event caller', {
## run events caller with qrp
############# not running this currently since it is erroring out (See: https://app.travis-ci.com/github/mskilab/gGnome/builds/237955222 )
gg.events = events(gg.jabba, QRP = TRUE)
## run events caller with qrp and with seismic
gg.events = events(gg.jabba, QRP = TRUE, seismic = TRUE)
expect_is(gg.events$meta$events, 'data.table')
expect_true(any(grepl('qrp', gg.events$meta$events$type)))
expect_true(any(grepl('seismic', gg.events$meta$events$type)))

## run events caller without qrp
gg.events.no.qrp = events(gg.jabba, QRP = FALSE)
Expand Down Expand Up @@ -109,3 +109,12 @@ test_that('microhomology', {
expect_error(microhomology(gg, fa))

})

test_that('seismic', {
# test the Rosswog et al. algorithm version
# clone the repo
seismic_dir = paste0(tempdir(), '/', 'SeismicAmplification')
system(paste0('git clone https://github.com/ShaiberAlon/SeismicAmplification.git ', seismic_dir))
s = seismic(gg.jabba, Rosswog = TRUE, rosswog_dir = seismic_dir)
expect_error(seismic(gG()))
})