Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

debugging features #43

Merged
merged 10 commits into from
Jan 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,5 @@ Remotes:
morinlab/GAMBLR.helpers
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
Roxygen: list(markdown = TRUE)
96 changes: 71 additions & 25 deletions R/annotate_sv.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,9 @@
#' @param return_as Stated format for returned output, default is "bedpe". Other accepted output formats are "bed" and "bedpe_entrez" (to keep entrez_ids for compatibility with portal.R and cBioPortal).
#' @param blacklist A vector of regions to be removed from annotations. Default coordinates are in respect to hg19.
#' @param genome_build Reference genome build parameter, default is grch37.
#' @param priority_to_be_oncogene Vector of gene names (default is `c("MYC", "BCL6")`) used to filter
#' rows based on genes that have the highest priority to be considered oncogenes. Genes to the left
#' (*i.e.* first elements of this vector) have higher priority; non-listed genes have the lowest priority.
#' See **Details** section for more information.
#' @param priority_to_be_oncogene Vector of gene names (default is `c("MYC", "BCL6")`) used to filter rows based on genes that have the highest priority to be considered oncogenes. Genes to the left (*i.e.* first elements of this vector) have higher priority; non-listed genes have the lowest priority. See **Details** section for more information.
#' @param oncogene_bed Optionally, provide bed for regions to be considered oncogenes.
#' @param include_shm_partners Set to TRUE if you want to allow aSHM regions to be considered valid oncogene partners when annotating SVs
#'
#' @return A data frame with annotated SVs (gene symbol and entrez ID).
#'
Expand All @@ -55,19 +54,31 @@ annotate_sv = function(sv_data,
return_as = "bedpe",
blacklist = c(60565248, 30303126, 187728894, 101357565, 101359747, 161734970, 69400840, 65217851, 187728889, 187728888,187728892, 187728893,188305164, 72551424, 72551425, 72551554, 72551555, 72551558, 72551559, 72551562, 189255425, 189255426, 189255461, 189255462),
genome_build = "grch37",
priority_to_be_oncogene = c("MYC", "BCL6")){

priority_to_be_oncogene = c("MYC", "BCL6"),
oncogene_bed,
include_shm_partners = FALSE){
if(!"SCORE" %in% colnames(sv_data)){
stop("input is missing required column 'SCORE'")
}
# remove duplicate rows in sv_data, if any
sv_data = unique(sv_data)

bedpe1 = sv_data %>%
dplyr::select("CHROM_A", "START_A", "END_A", "tumour_sample_id", ends_with("SCORE"), "STRAND_A")
dplyr::select(
"CHROM_A", "START_A", "END_A", "tumour_sample_id",
ends_with("SCORE"), "STRAND_A",
if ("ID" %in% colnames(sv_data)) "ID" else "manta_name"
)

bedpe2 = sv_data %>%
dplyr::select("CHROM_B", "START_B", "END_B", "tumour_sample_id", ends_with("SCORE"), "STRAND_B")
dplyr::select(
"CHROM_B", "START_B", "END_B", "tumour_sample_id",
ends_with("SCORE"), "STRAND_B",
if ("ID" %in% colnames(sv_data)) "ID" else "manta_name"
)

colnames(bedpe1) = c("chrom", "start", "end", "tumour_sample_id", "score", "strand1")
colnames(bedpe2) = c("chrom", "start", "end", "tumour_sample_id", "score", "strand2")
colnames(bedpe1) = c("chrom", "start", "end", "tumour_sample_id", "score", "strand1", "ID")
colnames(bedpe2) = c("chrom", "start", "end", "tumour_sample_id", "score", "strand2", "ID")
suppressWarnings({
if(any(grepl("chr", bedpe1$chrom))){
bedpe1 = dplyr::mutate(bedpe1, chrom = gsub("chr", "", chrom))
Expand All @@ -77,23 +88,59 @@ annotate_sv = function(sv_data,
if(missing(partner_bed)){
if(genome_build == "hg38"){
ig_regions = GAMBLR.data::hg38_partners
if(include_shm_partners){
extra_regions = GAMBLR.data::somatic_hypermutation_locations_GRCh38_v_latest %>%
filter(!gene %in% ig_regions$gene) %>%
dplyr::select(chr_name,hg38_start,hg38_end,gene) %>%
mutate(chr_name=str_remove(chr_name,"chr"))
extra_regions = extra_regions %>%
mutate(start=ifelse(hg38_start<hg38_end,hg38_start,hg38_end)) %>%
mutate(end=ifelse(hg38_start<hg38_end,hg38_end,hg38_start)) %>%
mutate(chrom = chr_name) %>%
dplyr::select(chrom,start,end,gene) %>%
mutate(start=start-5000,end=end+5000)

extra_regions$entrez = 0
ig_regions = bind_rows(extra_regions,ig_regions)
}
}else{
ig_regions = GAMBLR.data::grch37_partners
ig_regions = GAMBLR.data::grch37_partners

if(include_shm_partners){
extra_regions = GAMBLR.data::somatic_hypermutation_locations_GRCh37_v_latest %>%
filter(!gene %in% ig_regions$gene) %>%
dplyr::select(chr_name,hg19_start,hg19_end,gene) %>%
mutate(chr_name=str_remove(chr_name,"chr"))
extra_regions = extra_regions %>%
mutate(start=ifelse(hg19_start<hg19_end,hg19_start,hg19_end)) %>%
mutate(end=ifelse(hg19_start<hg19_end,hg19_end,hg19_start)) %>%
mutate(chrom = chr_name) %>%
dplyr::select(chrom,start,end,gene) %>%
mutate(start=start-5000,end=end+5000)

extra_regions$entrez = 0
ig_regions = bind_rows(extra_regions,ig_regions) %>% arrange(chrom)
}
}
}else{
ig_regions = partner_bed
if(!"entrez" %in% colnames(ig_regions)){
ig_regions$entrez = 0
}
}
if(genome_build == "hg38"){
oncogene_regions = GAMBLR.data::hg38_oncogene
if(missing(oncogene_bed)){
if(genome_build == "hg38"){
oncogene_regions = GAMBLR.data::hg38_oncogene
}else{
oncogene_regions = GAMBLR.data::grch37_oncogene
}
}else{
oncogene_regions = GAMBLR.data::grch37_oncogene
oncogene_regions = oncogene_bed
}

y = as.data.frame(oncogene_regions)

#use foverlaps to get oncogene SVs
#use overlaps to get oncogene SVs
a = as.data.frame(bedpe1)
a.onco = cool_overlaps(a, y, columns1 = c("chrom", "start", "end"), columns2 = c("chrom", "start", "end"), nomatch = TRUE) #oncogene-annotated bedpe for the first breakpoints
b = as.data.frame(bedpe2)
Expand All @@ -113,22 +160,22 @@ annotate_sv = function(sv_data,
b.ig = cool_overlaps(b.partner, y, type = "any", columns1 = c("chrom", "start", "end"), columns2 = c("chrom", "start", "end"), nomatch = TRUE)

a.ig <- a.ig %>%
mutate(id = row_number()) %>% # make sure the order is consistent
group_by(chrom, start, end, tumour_sample_id, score) %>%
mutate(row_id = row_number()) %>% # make sure the order is consistent
group_by(chrom, start, end, tumour_sample_id, score, ID) %>%
slice_head() %>%
ungroup %>%
arrange(id) %>%
arrange(row_id) %>%
as.data.frame() %>%
select(-id)
select(-row_id)

b.ig <- b.ig %>%
mutate(id = row_number()) %>% # make sure the order is consistent
group_by(chrom, start, end, tumour_sample_id, score) %>%
mutate(row_id = row_number()) %>% # make sure the order is consistent
group_by(chrom, start, end, tumour_sample_id, score, ID) %>%
slice_head() %>%
ungroup %>%
arrange(id) %>%
arrange(row_id) %>%
as.data.frame() %>%
select(-id)
select(-row_id)

a.ig = a.ig[,c("chrom", "start", "end", "strand2", "gene")]
b.ig = b.ig[,c("chrom", "start", "end", "strand1", "gene")]
Expand All @@ -143,9 +190,8 @@ annotate_sv = function(sv_data,
as.data.frame %>%
`rownames<-`(NULL)


all.annotated$fusion = dplyr::pull(tidyr::unite(all.annotated, fusion, partner, gene, sep = "-"), fusion)
all.annotated = dplyr::filter(all.annotated, !fusion %in% c("BCL6-BCL6", "CIITA-CIITA", "FOXP1-FOXP1"))
all.annotated = dplyr::filter(all.annotated, !fusion %in% c("BCL6-BCL6", "CIITA-CIITA", "FOXP1-FOXP1","PAX5-PAX5"))

all.annotated = dplyr::filter(all.annotated, !start1 %in% blacklist)
all.annotated = dplyr::filter(all.annotated, !start2 %in% blacklist)
Expand Down
6 changes: 4 additions & 2 deletions R/gene_to_region.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#' @param projection Reference genome build. Possible values are "grch37" (default) or "hg38".
#' @param return_as Specify the type of return. Default is "region" (chr:start-end), other acceptable arguments are "bed" and "df".
#' @param sort_regions A boolean parameter (TRUE is the default) indicating whether regions should be sorted by chomosome and start location.
#' @param pad_length Optionally, specify integer by how much to pad the region. Default 0 (no padding).
#'
#' @return A data frame, or a string with region(s) for the provided gene(s).
#'
Expand All @@ -29,7 +30,8 @@ gene_to_region = function(gene_symbol,
ensembl_id,
projection = "grch37",
return_as = "region",
sort_regions = TRUE){
sort_regions = TRUE,
pad_length = 0){

stopifnot('`projection` parameter must be "grch37" or "hg38"' = projection %in% c("grch37", "hg38"))
stopifnot('`return_as` parameter must be "region", "bed" or "df"' = return_as %in% c("region", "bed", "df"))
Expand Down Expand Up @@ -64,7 +66,7 @@ gene_to_region = function(gene_symbol,
region = dplyr::select(gene_coordinates, chromosome, start, end, gene_name, hugo_symbol, ensembl_gene_id) %>%
as.data.frame() %>%
dplyr::filter(chromosome %in% chr_select)

region = mutate(region, start = start - pad_length, end = end + pad_length)
if(sort_regions){
if(projection == "grch37"){
chrm_num = region$chromosome
Expand Down
61 changes: 55 additions & 6 deletions man/annotate_sv.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/gene_to_region.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading