From 57bedfc311cd269da13e6f2559794e2b30d309a7 Mon Sep 17 00:00:00 2001 From: ShaiberAlon Date: Wed, 6 Oct 2021 14:58:22 -0400 Subject: [PATCH] make parse.grl work with seqnames that have "-" inside them (some proteins have such seqnames) ; make ";" as the only acceptable separator by default ; allow users to supply a vector with separator characters --- R/gUtils.R | 10 +++++++--- tests/testthat/test_rangeops.R | 6 ++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/R/gUtils.R b/R/gUtils.R index ba4fdf7..c10f97c 100644 --- a/R/gUtils.R +++ b/R/gUtils.R @@ -4128,18 +4128,22 @@ parse.gr = function(...) #' #' @param x character vector representing a GRangesList with UCSC style coordinates (chr:start-end[+-]) representing a [signed] Granges and ";" separators within each item of x separating individaul each GRAnges #' @param seqlengths named integer vector representing genome (default = hg_seqlengths()) +#' @param separators characters used to separate between distinct ranges in each gr (default = c(';')). 
A single character could be provided or a vector that includes various optional separators #' @author Marcin Imielinski #' @return GRangesList parsed from IGV-/UCSC-style strings #' @export -parse.grl = function(x, seqlengths = hg_seqlengths(), meta = NULL) +parse.grl = function(x, seqlengths = hg_seqlengths(), meta = NULL, separators = c(';')) { nm = names(x) - tmp = strsplit(x, '\\s*[;\\,\\|]\\s*') + split_chr = paste0('\\s*[\\', paste(separators, collapse = '\\'), ']\\s*') + tmp = strsplit(x,split_chr) tmp.u = unlist(tmp) tmp.u = gsub('\\,', '', tmp.u) tmp.id = rep(1:length(tmp), sapply(tmp, length)) str = gsub('.*([\\+\\-])$','\\1', tmp.u) - spl = strsplit(tmp.u, "[\\-\\+\\:]", perl = T) + tmp.l = strsplit(tmp.u, ':', perl = T) # treat seqnames and ranges separately in order to work well with seqnames that contain "-" + l1 = lapply(tmp.l, function(s){strsplit(s[2], "[\\-\\+]", perl = T)[[1]]}) # split according to "-" and also get rid of the trailing "-" and "+" + spl = lapply(seq_along(tmp.l), function(ix){c(tmp.l[[ix]][1], l1[[ix]])}) if (any(ix <- sapply(spl, length)==2)){ spl[ix] = lapply(which(ix), function(x) spl[[x]][c(1:2,2)]) diff --git a/tests/testthat/test_rangeops.R b/tests/testthat/test_rangeops.R index 53b73f1..65d6d69 100644 --- a/tests/testthat/test_rangeops.R +++ b/tests/testthat/test_rangeops.R @@ -1429,6 +1429,12 @@ test_that("parse.grl", { expect_equal(width(grl_example[[2]][1]), 3000001) expect_equal(width(grl_example[[2]][2]), 79) + grl_example_alt_sep = parse.grl(c('chr1:1e6-5e6+,5:10-2000', 'chr2:2e6-5e6-|chr10:100231321-100231399'), + separators = c('|', ',')) + expect_equal(width(grl_example_alt_sep[[1]][1]), 4000001) + expect_equal(width(grl_example_alt_sep[[1]][2]), 1991) + expect_equal(width(grl_example_alt_sep[[2]][1]), 3000001) + expect_equal(width(grl_example_alt_sep[[2]][2]), 79) })