Skip to content

Commit

Permalink
handle physical pos when reordering
Browse files Browse the repository at this point in the history
  • Loading branch information
dramanica committed Dec 13, 2024
1 parent 29c03c4 commit 2ed883a
Show file tree
Hide file tree
Showing 9 changed files with 113 additions and 18 deletions.
4 changes: 0 additions & 4 deletions R/gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -288,9 +288,6 @@ gen_tibble.matrix <- function(x, indiv_meta, loci, ...,
backingfile <- change_duplicated_file_name(backingfile)
}

# use code for NA in FBM.256
# x[is.na(x)]<-3

bigsnp_obj <- gt_write_bigsnp_from_dfs(genotypes = x,
indiv_meta = indiv_meta,
loci = loci,
Expand Down Expand Up @@ -383,7 +380,6 @@ gt_write_bigsnp_from_dfs <- function(genotypes, indiv_meta, loci,
sex = 0,
affection = 0,
ploidy = ploidy)

map <- tibble(chromosome = loci$chromosome,
marker.ID = loci$name,
genetic.dist = as.double(loci$genetic_dist), ## make sure that genetic.dist is double
Expand Down
9 changes: 6 additions & 3 deletions R/gt_order_loci.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,23 @@
#' reordered; if TRUE, then the current loci table, which might have been reordered
#' manually, will be used, but only if the positions within each chromosome are
#' sequential
#' @param ignore_genetic_dist boolean to ignore the genetic distance when checking. Note
#' that, if `gentic_dist` are being ignored and they are not sorted, the function will
#' set them to zero to avoid problems with other software.
#' @param quiet boolean to suppress information about the files
#' @param ... other arguments
#' @return A [gen_tibble]
#' @export

gt_order_loci <- function(.x, use_current_table = FALSE, quiet = FALSE, ...){
gt_order_loci <- function(.x, use_current_table = FALSE, ignore_genetic_dist = TRUE, quiet = FALSE, ...){
if (use_current_table){
new_table <- show_loci(.x)
} else {
new_table <- show_loci(.x) %>% dplyr::arrange(.data$chr_int, .data$position)
show_loci(.x) <- new_table
}
# if asked to use the current table, check that it is ordered
is_loci_table_ordered(.x, error_on_false = TRUE)
gt_update_backingfile(.x, quiet=quiet, ...)
is_loci_table_ordered(.x, error_on_false = TRUE, ignore_genetic_dist = ignore_genetic_dist)
gt_update_backingfile(.x, quiet=quiet, ignore_genetic_dist = ignore_genetic_dist, ...)

}
17 changes: 16 additions & 1 deletion R/gt_update_backingfile.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,16 @@
#' for backing files used to store the data (they will be given a .bk
#' and .RDS automatically). If left to NULL (the default), the file name
#' will be based on the name f the current backing file.
#' @param ignore_genetic_dist boolean to ignore the genetic distance when checking. Note
#' that, if `gentic_dist` are being ignored and they are not sorted, the function will
#' set them to zero to avoid problems with other software.
#' @param chunk_size the number of loci to process at once
#' @param quiet boolean to suppress information about the files
#' @returns a [`gen_tibble`] with a backing file (i.e. a new File Backed Matrix)
#' @export

gt_update_backingfile <- function (.x, backingfile = NULL, chunk_size = NULL,
quiet = FALSE){
ignore_genetic_dist = TRUE, quiet = FALSE){
# if the backingfile is null, create a name based on the current backing file
if (is.null(backingfile)){
backingfile <- change_duplicated_file_name(gt_get_file_names(.x)[2])
Expand Down Expand Up @@ -69,6 +72,18 @@ gt_update_backingfile <- function (.x, backingfile = NULL, chunk_size = NULL,
# same for map
map <- attr(.x$genotypes,"bigsnp")$map
map <- map[.gt_bigsnp_cols(.x),]
# if we ignore genetic distance, set it to zero if it is not sorted
if (ignore_genetic_dist){
if (any(unlist(show_loci(.x) %>%
group_by(.data$chr_int) %>%
group_map(~ is.unsorted(.x$genetic_dist))))){
if (!quiet){
message("Genetic distances are not sorted, setting them to zero")

Check warning on line 81 in R/gt_update_backingfile.R

View check run for this annotation

Codecov / codecov/patch

R/gt_update_backingfile.R#L81

Added line #L81 was not covered by tests
}
show_loci(.x)$genetic_dist <- 0
}
}


# Create the bigSNP object
bigsnp_obj <- structure(list(genotypes = new_bk_matrix, fam = fam, map = map),
Expand Down
24 changes: 20 additions & 4 deletions R/is_loci_table_ordered.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,28 @@
#' or a [`gen_tibble`].
#' @param error_on_false logical, if `TRUE` an error is thrown if the loci
#' are not ordered.
#' @param ignore_physical logical, if `TRUE` the physical position is not checked.
#' @param ... other arguments passed to specific methods.
#' @returns a logical vector defining which loci are transversions
#' @rdname is_loci_table_ordered
#' @export
is_loci_table_ordered <- function(.x, error_on_false = FALSE, ...) {
is_loci_table_ordered <- function(.x, error_on_false = FALSE, ignore_genetic_dist = TRUE, ...) {
UseMethod("is_loci_table_ordered", .x)
}

#' @export
#' @rdname is_loci_table_ordered
is_loci_table_ordered.tbl_df <- function(.x, error_on_false = FALSE, ...) {
is_loci_table_ordered.tbl_df <- function(.x, error_on_false = FALSE, ignore_genetic_dist = TRUE, ...) {
#TODO this is a hack to deal with the class being dropped when going through group_map
stopifnot_gen_tibble(.x)
is_loci_table_ordered(.x$genotypes, error_on_false, ...)
is_loci_table_ordered(.x$genotypes, error_on_false = error_on_false,
ignore_genetic_dist = ignore_genetic_dist, ...)
}


#' @export
#' @rdname is_loci_table_ordered
is_loci_table_ordered.vctrs_bigSNP <- function(.x, error_on_false = FALSE, ...) {
is_loci_table_ordered.vctrs_bigSNP <- function(.x, error_on_false = FALSE, ignore_genetic_dist = TRUE, ...) {
rlang::check_dots_empty()

# check that within each chromosome positions are sorted
Expand All @@ -49,6 +51,20 @@ is_loci_table_ordered.vctrs_bigSNP <- function(.x, error_on_false = FALSE, ...)
return(FALSE)
}
}

# check genetic distance
if (!ignore_genetic_dist){
if (any(unlist(show_loci(.x) %>%
group_by(.data$chr_int) %>%
group_map(~ is.unsorted(.x$genetic_dist))))){
if (error_on_false){
stop("Your genetic distances are not sorted within chromosomes")

Check warning on line 61 in R/is_loci_table_ordered.R

View check run for this annotation

Codecov / codecov/patch

R/is_loci_table_ordered.R#L61

Added line #L61 was not covered by tests
} else {
return(FALSE)
}
}
}

return(TRUE)
}

12 changes: 11 additions & 1 deletion man/gt_order_loci.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 11 additions & 1 deletion man/gt_update_backingfile.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 20 additions & 3 deletions man/is_loci_table_ordered.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/testthat/test_gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -597,7 +597,7 @@ test_that("chr_int is correct",{
chromosome_names <- c("1", "2", NA, "4")
expect_true(identical(c(1L,2L,NA,4L), cast_chromosome_to_int(chromosome_names)))
chromosome_names <- c("chr1", "chr2", NA, "chr4")
expect_true(identical(c(1L,2L,NA,3L), cast_chromosome_to_int(chromosome_names)))
expect_true(identical(c(1L,2L,NA,4L), cast_chromosome_to_int(chromosome_names)))
chromosome_names <- c("a", "b", NA, "c")
expect_true(identical(c(1L,2L,NA,3L), cast_chromosome_to_int(chromosome_names)))

Expand Down
28 changes: 28 additions & 0 deletions tests/testthat/test_gt_order_loci.R
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,34 @@ test_that("gt_order_loci use_current_table = TRUE",{
expect_error(gt_order_loci(test_gt, use_current_table = TRUE, quiet = TRUE),"Your loci are not sorted within chromosomes")
})

test_that("gt_order_loci catches physical positions out of sync",{

test_indiv_meta <- data.frame (id=c("a","b","c"),
population = c("pop1","pop1","pop2"))
test_genotypes <- rbind(c(1,1,0,1,1,0),
c(2,1,0,0,0,0),
c(2,2,0,0,1,1))
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,27,83)),
genetic_dist = as.double(c(0.3,0.7,0,0.2,0.0,0.3)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))

path <- tempfile()
test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE, backingfile = path)

expect_true(is_loci_table_ordered(test_gt))
expect_false(is_loci_table_ordered(test_gt, ignore_genetic_dist = FALSE))
# now order it
ordered_test_gt <- gt_order_loci(test_gt, use_current_table = FALSE, quiet = TRUE)
# now we pass the test as we set genetic distances to zero
expect_true(is_loci_table_ordered(ordered_test_gt, ignore_genetic_dist = FALSE))
# check that genetic_dist is all zeroes
expect_equal(show_loci(ordered_test_gt)$genetic_dist, rep(0,6))
})


test_that("chr_int from character chromosomes",{
test_indiv_meta <- data.frame (id=c("a","b","c"),
population = c("pop1","pop1","pop2"))
Expand Down

0 comments on commit 2ed883a

Please sign in to comment.