morinlab · Kdreval · Jan 20, 2025 · Apr 30, 2024 · Apr 30, 2024 · Apr 30, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -30,5 +30,5 @@ Remotes:
     morinlab/GAMBLR.helpers
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Roxygen: list(markdown = TRUE)
diff --git a/R/annotate_sv.R b/R/annotate_sv.R
@@ -28,10 +28,9 @@
 #' @param return_as Stated format for returned output, default is "bedpe". Other accepted output formats are "bed" and "bedpe_entrez" (to keep entrez_ids for compatibility with portal.R and cBioPortal).
 #' @param blacklist A vector of regions to be removed from annotations. Default coordinates are in respect to hg19.
 #' @param genome_build Reference genome build parameter, default is grch37.
-#' @param priority_to_be_oncogene Vector of gene names (default is `c("MYC", "BCL6")`) used to filter 
-#' rows based on genes that have the highest priority to be considered oncogenes. Genes to the left 
-#' (*i.e.* first elements of this vector) have higher priority; non-listed genes have the lowest priority. 
-#' See **Details** section for more information.
+#' @param priority_to_be_oncogene Vector of gene names (default is `c("MYC", "BCL6")`) used to filter rows based on genes that have the highest priority to be considered oncogenes. Genes to the left (*i.e.* first elements of this vector) have higher priority; non-listed genes have the lowest priority. See **Details** section for more information.
+#' @param oncogene_bed Optionally, provide bed for regions to be considered oncogenes.
+#' @param include_shm_partners Set to TRUE if you want to allow aSHM regions to be considered valid oncogene partners when annotating SVs
 #' 
 #' @return A data frame with annotated SVs (gene symbol and entrez ID).
 #' 
@@ -55,19 +54,31 @@ annotate_sv = function(sv_data,
                        return_as = "bedpe",
                        blacklist = c(60565248, 30303126, 187728894, 101357565, 101359747, 161734970, 69400840, 65217851, 187728889, 187728888,187728892, 187728893,188305164, 72551424, 72551425, 72551554, 72551555, 72551558, 72551559, 72551562, 189255425, 189255426, 189255461, 189255462),
                        genome_build = "grch37",
-                       priority_to_be_oncogene = c("MYC", "BCL6")){
-
+                       priority_to_be_oncogene = c("MYC", "BCL6"),
+                       oncogene_bed,
+                       include_shm_partners = FALSE){
+  if(!"SCORE" %in% colnames(sv_data)){
+    stop("input is missing required column 'SCORE'")
+  }
   # remove duplicate rows in sv_data, if any
   sv_data = unique(sv_data)
 
   bedpe1 = sv_data %>%
-    dplyr::select("CHROM_A", "START_A", "END_A", "tumour_sample_id", ends_with("SCORE"), "STRAND_A")
+    dplyr::select(
+        "CHROM_A", "START_A", "END_A", "tumour_sample_id",
+        ends_with("SCORE"), "STRAND_A",
+        if ("ID" %in% colnames(sv_data)) "ID" else "manta_name"
+    )
 
   bedpe2 = sv_data %>%
-    dplyr::select("CHROM_B", "START_B", "END_B", "tumour_sample_id", ends_with("SCORE"), "STRAND_B")
+    dplyr::select(
+        "CHROM_B", "START_B", "END_B", "tumour_sample_id",
+        ends_with("SCORE"), "STRAND_B",
+        if ("ID" %in% colnames(sv_data)) "ID" else "manta_name"
+    )
 
-  colnames(bedpe1) = c("chrom", "start", "end", "tumour_sample_id", "score", "strand1")
-  colnames(bedpe2) = c("chrom", "start", "end", "tumour_sample_id", "score", "strand2")
+  colnames(bedpe1) = c("chrom", "start", "end", "tumour_sample_id", "score", "strand1", "ID")
+  colnames(bedpe2) = c("chrom", "start", "end", "tumour_sample_id", "score", "strand2", "ID")
   suppressWarnings({
     if(any(grepl("chr", bedpe1$chrom))){
       bedpe1 = dplyr::mutate(bedpe1, chrom = gsub("chr", "", chrom))
@@ -77,23 +88,59 @@ annotate_sv = function(sv_data,
   if(missing(partner_bed)){
     if(genome_build == "hg38"){
       ig_regions = GAMBLR.data::hg38_partners
+      if(include_shm_partners){
+        extra_regions = GAMBLR.data::somatic_hypermutation_locations_GRCh38_v_latest %>% 
+          filter(!gene %in% ig_regions$gene) %>%
+          dplyr::select(chr_name,hg38_start,hg38_end,gene) %>%
+          mutate(chr_name=str_remove(chr_name,"chr"))
+        extra_regions = extra_regions %>% 
+          mutate(start=ifelse(hg38_start<hg38_end,hg38_start,hg38_end)) %>%
+          mutate(end=ifelse(hg38_start<hg38_end,hg38_end,hg38_start)) %>%
+          mutate(chrom = chr_name) %>%
+          dplyr::select(chrom,start,end,gene) %>%
+          mutate(start=start-5000,end=end+5000)
+
+        extra_regions$entrez = 0
+        ig_regions = bind_rows(extra_regions,ig_regions)
+      }
     }else{
-      ig_regions = GAMBLR.data::grch37_partners
+      ig_regions = GAMBLR.data::grch37_partners 
+
+      if(include_shm_partners){
+       extra_regions = GAMBLR.data::somatic_hypermutation_locations_GRCh37_v_latest %>% 
+        filter(!gene %in% ig_regions$gene) %>%
+        dplyr::select(chr_name,hg19_start,hg19_end,gene) %>%
+         mutate(chr_name=str_remove(chr_name,"chr"))
+       extra_regions = extra_regions %>% 
+         mutate(start=ifelse(hg19_start<hg19_end,hg19_start,hg19_end)) %>%
+         mutate(end=ifelse(hg19_start<hg19_end,hg19_end,hg19_start)) %>%
+         mutate(chrom = chr_name) %>%
+         dplyr::select(chrom,start,end,gene) %>%
+         mutate(start=start-5000,end=end+5000)
+
+       extra_regions$entrez = 0
+       ig_regions = bind_rows(extra_regions,ig_regions) %>% arrange(chrom)
+      }
     }
   }else{
     ig_regions = partner_bed
     if(!"entrez" %in% colnames(ig_regions)){
       ig_regions$entrez = 0
     }
   }
-  if(genome_build == "hg38"){
-    oncogene_regions = GAMBLR.data::hg38_oncogene
+  if(missing(oncogene_bed)){
+    if(genome_build == "hg38"){
+      oncogene_regions = GAMBLR.data::hg38_oncogene
+    }else{
+      oncogene_regions = GAMBLR.data::grch37_oncogene
+    }
   }else{
-    oncogene_regions = GAMBLR.data::grch37_oncogene
+    oncogene_regions = oncogene_bed
   }
+
   y = as.data.frame(oncogene_regions)
 
-  #use foverlaps to get oncogene SVs
+  #use overlaps to get oncogene SVs
   a = as.data.frame(bedpe1)
   a.onco = cool_overlaps(a, y, columns1 = c("chrom", "start", "end"), columns2 = c("chrom", "start", "end"), nomatch = TRUE) #oncogene-annotated bedpe for the first breakpoints
   b = as.data.frame(bedpe2)
@@ -113,22 +160,22 @@ annotate_sv = function(sv_data,
   b.ig = cool_overlaps(b.partner, y, type = "any", columns1 = c("chrom", "start", "end"), columns2 = c("chrom", "start", "end"), nomatch = TRUE)
 
   a.ig <- a.ig %>%
-    mutate(id = row_number()) %>% # make sure the order is consistent
-    group_by(chrom, start, end, tumour_sample_id, score) %>%
+    mutate(row_id = row_number()) %>% # make sure the order is consistent
+    group_by(chrom, start, end, tumour_sample_id, score, ID) %>%
     slice_head() %>%
     ungroup %>%
-    arrange(id) %>%
+    arrange(row_id) %>%
     as.data.frame() %>%
-    select(-id)
+    select(-row_id)
 
   b.ig <- b.ig %>%
-    mutate(id = row_number()) %>% # make sure the order is consistent
-    group_by(chrom, start, end, tumour_sample_id, score) %>%
+    mutate(row_id = row_number()) %>% # make sure the order is consistent
+    group_by(chrom, start, end, tumour_sample_id, score, ID) %>%
     slice_head() %>%
     ungroup %>%
-    arrange(id) %>%
+    arrange(row_id) %>%
     as.data.frame() %>%
-    select(-id)
+    select(-row_id)
 
   a.ig = a.ig[,c("chrom", "start", "end", "strand2", "gene")]
   b.ig = b.ig[,c("chrom", "start", "end", "strand1", "gene")]
@@ -143,9 +190,8 @@ annotate_sv = function(sv_data,
     as.data.frame %>%
     `rownames<-`(NULL)
 
-
   all.annotated$fusion = dplyr::pull(tidyr::unite(all.annotated, fusion, partner, gene, sep = "-"), fusion)
-  all.annotated = dplyr::filter(all.annotated, !fusion %in% c("BCL6-BCL6", "CIITA-CIITA", "FOXP1-FOXP1"))
+  all.annotated = dplyr::filter(all.annotated, !fusion %in% c("BCL6-BCL6", "CIITA-CIITA", "FOXP1-FOXP1","PAX5-PAX5"))
 
   all.annotated = dplyr::filter(all.annotated, !start1 %in% blacklist)
   all.annotated = dplyr::filter(all.annotated, !start2 %in% blacklist)

diff --git a/R/gene_to_region.R b/R/gene_to_region.R
@@ -12,6 +12,7 @@
 #' @param projection Reference genome build. Possible values are "grch37" (default) or "hg38".
 #' @param return_as Specify the type of return. Default is "region" (chr:start-end), other acceptable arguments are "bed" and "df".
 #' @param sort_regions A boolean parameter (TRUE is the default) indicating whether regions should be sorted by chomosome and start location.
+#' @param pad_length Optionally, specify integer by how much to pad the region. Default 0 (no padding).
 #'
 #' @return A data frame, or a string with region(s) for the provided gene(s).
 #'
@@ -29,7 +30,8 @@ gene_to_region = function(gene_symbol,
                           ensembl_id,
                           projection = "grch37",
                           return_as = "region",
-                          sort_regions = TRUE){
+                          sort_regions = TRUE,
+                          pad_length = 0){
 
   stopifnot('`projection` parameter must be "grch37" or "hg38"' = projection %in% c("grch37", "hg38"))
   stopifnot('`return_as` parameter must be "region", "bed" or "df"' = return_as %in% c("region", "bed", "df"))
@@ -64,7 +66,7 @@ gene_to_region = function(gene_symbol,
   region = dplyr::select(gene_coordinates, chromosome, start, end, gene_name, hugo_symbol, ensembl_gene_id) %>%
     as.data.frame() %>% 
     dplyr::filter(chromosome %in% chr_select)
-
+  region = mutate(region, start = start - pad_length, end = end + pad_length)
   if(sort_regions){
     if(projection == "grch37"){
       chrm_num = region$chromosome

diff --git a/man/annotate_sv.Rd b/man/annotate_sv.Rd
diff --git a/man/gene_to_region.Rd b/man/gene_to_region.Rd