Merge pull request #47 from EvolEcolGroup/use_position

Use position when merging
EvolEcolGroup · Jun 24, 2024 · 26bd65d · 26bd65d
2 parents f40fc98 + 3a168ed
commit 26bd65d
Show file tree

Hide file tree

Showing 7 changed files with 75 additions and 9 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: tidypopgen
 Title: Tidy Population Genetics
-Version: 0.0.0.9014
+Version: 0.0.0.9015
 Authors@R: 
     c(person("Evie", "Carter", role = c("aut")),
     person("Andrea", "Manica", email = "[email protected]", 

diff --git a/R/rbind_dry_run.R b/R/rbind_dry_run.R
@@ -12,6 +12,12 @@
 #' be used as template to flip the ones in `target` and/or
 #' swap their order as necessary.
 #' @param target either a [`gen_tibble`] object, or the path to the PLINK bim file
+#' @param use_position boolean of whether a combination of chromosome and position
+#' should be used for matching SNPs. By default, `rbind` uses the locus name,
+#' so this is set to FALSE. When using 'use_position=TRUE', make sure chromosomes
+#' are coded in the same way in both `gen_tibbles` (a mix of e.g. 'chr1', '1' or
+#' 'chromosome1' can be the reasons if an unexpectedly large number variants
+#' are dropped when merging).
 #' @param flip_strand boolean on whether strand flipping should be checked to
 #' match the two datasets. Ambiguous SNPs (i.e. A/T and C/G)
 #' will also be removed.  It defaults to FALSE
@@ -27,7 +33,7 @@
 #' to align the two datasets, `target` data.frame)
 #' @export
 
-rbind_dry_run <- function(ref, target, flip_strand = FALSE,
+rbind_dry_run <- function(ref, target, use_position = FALSE, flip_strand = FALSE,
                           quiet = FALSE){
   # create a data.frame with loci names, numeric_id, and alleles
   # it requires a specific formatting to work
@@ -38,6 +44,13 @@ rbind_dry_run <- function(ref, target, flip_strand = FALSE,
   # replace NA with "0" for missing allele to avoid subsetting headaches (NA does not play nice with subsetting)
   ref_df$allele_alt[is.na(ref_df$allele_alt)]<-"0"
   target_df$allele_alt[is.na(target_df$allele_alt)]<-"0"
+
+  # replace the names with a combination of chromosome and position
+  if (use_position){
+    target_df <- target_df %>% mutate(name_old = .data$name, name = paste(.data$chromosome,.data$position, sep="_") )
+    ref_df <- ref_df %>% mutate(name_old = .data$name, name = paste(.data$chromosome,.data$position, sep="_") )
+  }
+
   # rename the alleles
   ref_df <- ref_df %>% rename(allele_1 = "allele_alt", allele_2 = "allele_ref")
   target_df <- target_df %>% rename(allele_1 = "allele_alt", allele_2 = "allele_ref")
@@ -61,7 +74,7 @@ rbind_dry_run_df <- function(ref_df, target_df,  flip_strand, quiet){
   # reorder target_sub to match ref_sub
   target_sub <- target_sub[match(ref_sub$name, target_sub$name),]
   # we now have two data.frames with the same loci and in the same order
- stopifnot(all.equal(target_sub$name,ref_sub$name))
+  stopifnot(all.equal(target_sub$name,ref_sub$name))
 
   # fix any missing alleles
   target_sub$missing_allele <- resolve_missing_alleles(missing_table = target_sub, other_table = ref_sub)

diff --git a/R/rbind_gen_tibble.R b/R/rbind_gen_tibble.R
@@ -22,6 +22,12 @@
 #' @param as_is boolean determining whether the loci should be left as they are
 #' before merging. If FALSE (the defaults), `rbind` will attempt to subset and
 #' swap alleles as needed.
+#' @param use_position boolean of whether a combination of chromosome and position
+#' should be used for matching SNPs. By default, `rbind` uses the locus name,
+#' so this is set to FALSE. When using 'use_position=TRUE', make sure chromosomes
+#' are coded in the same way in both `gen_tibbles` (a mix of e.g. 'chr1', '1' or
+#' 'chromosome1' can be the reasons if an unexpectedly large number variants
+#' are dropped when merging).
 #' @param flip_strand boolean on whether strand flipping should be checked to
 #' match the two datasets. If this is set to TRUE, ambiguous SNPs (i.e. A/T and C/G)
 #' will also be removed. It defaults to FALSE
@@ -31,7 +37,7 @@
 #' as its backing file for the FBM)
 #' @returns a [`gen_tibble`] with the merged data.
 #' @export
-rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE,
+rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE, use_position = FALSE,
               quiet = FALSE, backingfile=NULL){
   dots <- list(...)
   if (length(dots)!=2){
@@ -57,10 +63,15 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE,
     backingfile <- tempfile("gt_merged_",tmpdir = save_path, fileext = "")
   }
 
+  # if we use position, we update the names of the loci
+  if (use_position){
+    show_loci(ref) <- show_loci(ref) %>% mutate(name_old = .data$name, name = paste(.data$chromosome,.data$position, sep="_") )
+    show_loci(target) <- show_loci(target) %>% mutate(name_old = .data$name, name = paste(.data$chromosome,.data$position, sep="_") )
+  }
 
 
   report <- rbind_dry_run(ref = ref, target = target, flip_strand=flip_strand,
-                          quiet = quiet)
+                          quiet = quiet, use_position = use_position)
   # now edit the gen_tibble objects
   ###########
   # we start with the ref object
@@ -74,6 +85,9 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE,
   new_ref_loci_tbl <- show_loci(ref)[order(report$ref$new_id,na.last=NA),]
   # now we subset the SNP object
   ## in the snp object
+  if (nrow(new_ref_loci_tbl)==0){
+    stop("there are no loci in common between the two gen_tibbles")
+  }
   ref_snp <- subset_bigSNP(attr(ref$genotypes,"bigsnp"),
                            loci_indices = new_ref_loci_tbl$big_index,
                            indiv_indices = vctrs::vec_data(ref$genotypes))
@@ -139,7 +153,10 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE,
   vctrs::vec_data(ref$genotypes)
   #and finally append the loci table
   indivs_with_big_names <- c(names(ref$genotypes),names(target$genotypes))
-  new_ref_loci_tbl$big_index<-match(new_ref_loci_tbl$name,merged_snp$map$marker.ID) # TODO check that this is the correct order!!!!
+  #browser()
+  #new_ref_loci_tbl$big_index<-match(new_ref_loci_tbl$name,merged_snp$map$marker.ID) # TODO check that this is the correct order!!!!
+  new_ref_loci_tbl$big_index<-1:nrow(new_ref_loci_tbl) # by default, this should just be a subset in the same order as the reference
+
   # TODO check that all individuals in tibble and bigsnp object are the same
   merged_tbl$genotypes <- vctrs::new_vctr(match(indivs_with_big_names,merged_snp$fam$sample.ID), # TODO check that this is the correct order!!!!
                   bigsnp = merged_snp,

diff --git a/man/rbind.gen_tbl.Rd b/man/rbind.gen_tbl.Rd
diff --git a/man/rbind_dry_run.Rd b/man/rbind_dry_run.Rd
diff --git a/tests/testthat/test_rbind_gen_tibble.R b/tests/testthat/test_rbind_gen_tibble.R
@@ -6,7 +6,7 @@ raw_path_pop_a <- system.file("extdata/pop_a.bed", package = "tidypopgen")
 bigsnp_path_a <- bigsnpr::snp_readBed(raw_path_pop_a, backingfile = tempfile("test_a_"))
 pop_a_gt <- gen_tibble(bigsnp_path_a, quiet=TRUE)
 # #create merge
- merged_gen <- rbind.gen_tbl(pop_b_gt, pop_a_gt, flip_strand = TRUE,
+merged_gen <- rbind.gen_tbl(pop_b_gt, pop_a_gt, flip_strand = TRUE,
                              quiet = TRUE,
                              backingfile = tempfile())
 
@@ -41,3 +41,12 @@ test_that("merge combines datasets correctly",{
   testthat::expect_false("rs307354" %in% loci_names(merged_gen))
 
 })
+
+test_that("merge by position works correctly",{
+  pop_a_renamed_gt <- pop_a_gt
+  show_loci(pop_a_renamed_gt)$name <- paste("new_name",1:count_loci(pop_a_renamed_gt),sep="_")
+  # if we bind by name we should get zero (or an error)
+  expect_error(rbind(pop_b_gt, pop_a_renamed_gt, quiet=TRUE), "there are no loci in common")
+  pos_merge <- rbind(pop_b_gt, pop_a_renamed_gt, flip_strand = TRUE, use_position = TRUE)
+  expect_true(all.equal(show_genotypes(merged_gen),show_genotypes(pos_merge)))
+})
diff --git a/vignettes/a01_overview.Rmd b/vignettes/a01_overview.Rmd
@@ -450,7 +450,13 @@ merged_gt %>% show_loci()
 ```
 
 Again, note that the `big_index` values have changed compared to the original files,
-as we generated a new FBM with the merged data. 
+as we generated a new FBM with the merged data.
+
+By default, `rbind` using the loci names to match the two datasets. The option
+'use_position = TRUE' forces matching by chromosome and position; note that a common
+source of problems when merging is the coding of chromosomes (e.g. a dataset has
+'chromosome1' and the other 'chr1'). If using 'use_position = TRUE', you should
+also make sure that the two datasets were aligned to the same reference genome assembly.
 
 # Imputation