diff --git a/DESCRIPTION b/DESCRIPTION index 4769c8eb..fc0e502b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tidypopgen Title: Tidy Population Genetics -Version: 0.0.0.9014 +Version: 0.0.0.9015 Authors@R: c(person("Evie", "Carter", role = c("aut")), person("Andrea", "Manica", email = "am315@cam.ac.uk", diff --git a/R/rbind_dry_run.R b/R/rbind_dry_run.R index a1b71cce..c7e58498 100644 --- a/R/rbind_dry_run.R +++ b/R/rbind_dry_run.R @@ -12,6 +12,12 @@ #' be used as template to flip the ones in `target` and/or #' swap their order as necessary. #' @param target either a [`gen_tibble`] object, or the path to the PLINK bim file +#' @param use_position boolean of whether a combination of chromosome and position +#' should be used for matching SNPs. By default, `rbind` uses the locus name, +#' so this is set to FALSE. When using 'use_position=TRUE', make sure chromosomes +#' are coded in the same way in both `gen_tibbles` (a mix of e.g. 'chr1', '1' or +#' 'chromosome1' can be the reasons if an unexpectedly large number variants +#' are dropped when merging). #' @param flip_strand boolean on whether strand flipping should be checked to #' match the two datasets. Ambiguous SNPs (i.e. A/T and C/G) #' will also be removed. It defaults to FALSE @@ -27,7 +33,7 @@ #' to align the two datasets, `target` data.frame) #' @export -rbind_dry_run <- function(ref, target, flip_strand = FALSE, +rbind_dry_run <- function(ref, target, use_position = FALSE, flip_strand = FALSE, quiet = FALSE){ # create a data.frame with loci names, numeric_id, and alleles # it requires a specific formatting to work @@ -38,6 +44,13 @@ rbind_dry_run <- function(ref, target, flip_strand = FALSE, # replace NA with "0" for missing allele to avoid subsetting headaches (NA does not play nice with subsetting) ref_df$allele_alt[is.na(ref_df$allele_alt)]<-"0" target_df$allele_alt[is.na(target_df$allele_alt)]<-"0" + + # replace the names with a combination of chromosome and position + if (use_position){ + target_df <- target_df %>% mutate(name_old = .data$name, name = paste(.data$chromosome,.data$position, sep="_") ) + ref_df <- ref_df %>% mutate(name_old = .data$name, name = paste(.data$chromosome,.data$position, sep="_") ) + } + # rename the alleles ref_df <- ref_df %>% rename(allele_1 = "allele_alt", allele_2 = "allele_ref") target_df <- target_df %>% rename(allele_1 = "allele_alt", allele_2 = "allele_ref") @@ -61,7 +74,7 @@ rbind_dry_run_df <- function(ref_df, target_df, flip_strand, quiet){ # reorder target_sub to match ref_sub target_sub <- target_sub[match(ref_sub$name, target_sub$name),] # we now have two data.frames with the same loci and in the same order - stopifnot(all.equal(target_sub$name,ref_sub$name)) + stopifnot(all.equal(target_sub$name,ref_sub$name)) # fix any missing alleles target_sub$missing_allele <- resolve_missing_alleles(missing_table = target_sub, other_table = ref_sub) diff --git a/R/rbind_gen_tibble.R b/R/rbind_gen_tibble.R index 916278d7..b91e7fec 100644 --- a/R/rbind_gen_tibble.R +++ b/R/rbind_gen_tibble.R @@ -22,6 +22,12 @@ #' @param as_is boolean determining whether the loci should be left as they are #' before merging. If FALSE (the defaults), `rbind` will attempt to subset and #' swap alleles as needed. +#' @param use_position boolean of whether a combination of chromosome and position +#' should be used for matching SNPs. By default, `rbind` uses the locus name, +#' so this is set to FALSE. When using 'use_position=TRUE', make sure chromosomes +#' are coded in the same way in both `gen_tibbles` (a mix of e.g. 'chr1', '1' or +#' 'chromosome1' can be the reasons if an unexpectedly large number variants +#' are dropped when merging). #' @param flip_strand boolean on whether strand flipping should be checked to #' match the two datasets. If this is set to TRUE, ambiguous SNPs (i.e. A/T and C/G) #' will also be removed. It defaults to FALSE @@ -31,7 +37,7 @@ #' as its backing file for the FBM) #' @returns a [`gen_tibble`] with the merged data. #' @export -rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE, +rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE, use_position = FALSE, quiet = FALSE, backingfile=NULL){ dots <- list(...) if (length(dots)!=2){ @@ -57,10 +63,15 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE, backingfile <- tempfile("gt_merged_",tmpdir = save_path, fileext = "") } + # if we use position, we update the names of the loci + if (use_position){ + show_loci(ref) <- show_loci(ref) %>% mutate(name_old = .data$name, name = paste(.data$chromosome,.data$position, sep="_") ) + show_loci(target) <- show_loci(target) %>% mutate(name_old = .data$name, name = paste(.data$chromosome,.data$position, sep="_") ) + } report <- rbind_dry_run(ref = ref, target = target, flip_strand=flip_strand, - quiet = quiet) + quiet = quiet, use_position = use_position) # now edit the gen_tibble objects ########### # we start with the ref object @@ -74,6 +85,9 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE, new_ref_loci_tbl <- show_loci(ref)[order(report$ref$new_id,na.last=NA),] # now we subset the SNP object ## in the snp object + if (nrow(new_ref_loci_tbl)==0){ + stop("there are no loci in common between the two gen_tibbles") + } ref_snp <- subset_bigSNP(attr(ref$genotypes,"bigsnp"), loci_indices = new_ref_loci_tbl$big_index, indiv_indices = vctrs::vec_data(ref$genotypes)) @@ -139,7 +153,10 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE, vctrs::vec_data(ref$genotypes) #and finally append the loci table indivs_with_big_names <- c(names(ref$genotypes),names(target$genotypes)) - new_ref_loci_tbl$big_index<-match(new_ref_loci_tbl$name,merged_snp$map$marker.ID) # TODO check that this is the correct order!!!! + #browser() + #new_ref_loci_tbl$big_index<-match(new_ref_loci_tbl$name,merged_snp$map$marker.ID) # TODO check that this is the correct order!!!! + new_ref_loci_tbl$big_index<-1:nrow(new_ref_loci_tbl) # by default, this should just be a subset in the same order as the reference + # TODO check that all individuals in tibble and bigsnp object are the same merged_tbl$genotypes <- vctrs::new_vctr(match(indivs_with_big_names,merged_snp$fam$sample.ID), # TODO check that this is the correct order!!!! bigsnp = merged_snp, diff --git a/man/rbind.gen_tbl.Rd b/man/rbind.gen_tbl.Rd index bc73f2d9..548c17f9 100644 --- a/man/rbind.gen_tbl.Rd +++ b/man/rbind.gen_tbl.Rd @@ -8,6 +8,7 @@ ..., as_is = FALSE, flip_strand = FALSE, + use_position = FALSE, quiet = FALSE, backingfile = NULL ) @@ -24,6 +25,13 @@ swap alleles as needed.} match the two datasets. If this is set to TRUE, ambiguous SNPs (i.e. A/T and C/G) will also be removed. It defaults to FALSE} +\item{use_position}{boolean of whether a combination of chromosome and position +should be used for matching SNPs. By default, \code{rbind} uses the locus name, +so this is set to FALSE. When using 'use_position=TRUE', make sure chromosomes +are coded in the same way in both \code{gen_tibbles} (a mix of e.g. 'chr1', '1' or +'chromosome1' can be the reasons if an unexpectedly large number variants +are dropped when merging).} + \item{quiet}{boolean whether to omit reporting to screen} \item{backingfile}{the path and prefix of the files used to store the diff --git a/man/rbind_dry_run.Rd b/man/rbind_dry_run.Rd index aed15816..c357fde4 100644 --- a/man/rbind_dry_run.Rd +++ b/man/rbind_dry_run.Rd @@ -4,7 +4,13 @@ \alias{rbind_dry_run} \title{Generate a report of what would happen to each SNP in a merge} \usage{ -rbind_dry_run(ref, target, flip_strand = FALSE, quiet = FALSE) +rbind_dry_run( + ref, + target, + use_position = FALSE, + flip_strand = FALSE, + quiet = FALSE +) } \arguments{ \item{ref}{either a \code{\link{gen_tibble}} object, or the path to the PLINK bim file; @@ -14,6 +20,13 @@ swap their order as necessary.} \item{target}{either a \code{\link{gen_tibble}} object, or the path to the PLINK bim file} +\item{use_position}{boolean of whether a combination of chromosome and position +should be used for matching SNPs. By default, \code{rbind} uses the locus name, +so this is set to FALSE. When using 'use_position=TRUE', make sure chromosomes +are coded in the same way in both \code{gen_tibbles} (a mix of e.g. 'chr1', '1' or +'chromosome1' can be the reasons if an unexpectedly large number variants +are dropped when merging).} + \item{flip_strand}{boolean on whether strand flipping should be checked to match the two datasets. Ambiguous SNPs (i.e. A/T and C/G) will also be removed. It defaults to FALSE} diff --git a/tests/testthat/test_rbind_gen_tibble.R b/tests/testthat/test_rbind_gen_tibble.R index 7e0bb304..20966fe7 100644 --- a/tests/testthat/test_rbind_gen_tibble.R +++ b/tests/testthat/test_rbind_gen_tibble.R @@ -6,7 +6,7 @@ raw_path_pop_a <- system.file("extdata/pop_a.bed", package = "tidypopgen") bigsnp_path_a <- bigsnpr::snp_readBed(raw_path_pop_a, backingfile = tempfile("test_a_")) pop_a_gt <- gen_tibble(bigsnp_path_a, quiet=TRUE) # #create merge - merged_gen <- rbind.gen_tbl(pop_b_gt, pop_a_gt, flip_strand = TRUE, +merged_gen <- rbind.gen_tbl(pop_b_gt, pop_a_gt, flip_strand = TRUE, quiet = TRUE, backingfile = tempfile()) @@ -41,3 +41,12 @@ test_that("merge combines datasets correctly",{ testthat::expect_false("rs307354" %in% loci_names(merged_gen)) }) + +test_that("merge by position works correctly",{ + pop_a_renamed_gt <- pop_a_gt + show_loci(pop_a_renamed_gt)$name <- paste("new_name",1:count_loci(pop_a_renamed_gt),sep="_") + # if we bind by name we should get zero (or an error) + expect_error(rbind(pop_b_gt, pop_a_renamed_gt, quiet=TRUE), "there are no loci in common") + pos_merge <- rbind(pop_b_gt, pop_a_renamed_gt, flip_strand = TRUE, use_position = TRUE) + expect_true(all.equal(show_genotypes(merged_gen),show_genotypes(pos_merge))) +}) diff --git a/vignettes/a01_overview.Rmd b/vignettes/a01_overview.Rmd index 6b9232ba..291f294d 100644 --- a/vignettes/a01_overview.Rmd +++ b/vignettes/a01_overview.Rmd @@ -450,7 +450,13 @@ merged_gt %>% show_loci() ``` Again, note that the `big_index` values have changed compared to the original files, -as we generated a new FBM with the merged data. +as we generated a new FBM with the merged data. + +By default, `rbind` using the loci names to match the two datasets. The option +'use_position = TRUE' forces matching by chromosome and position; note that a common +source of problems when merging is the coding of chromosomes (e.g. a dataset has +'chromosome1' and the other 'chr1'). If using 'use_position = TRUE', you should +also make sure that the two datasets were aligned to the same reference genome assembly. # Imputation