Skip to content

Commit

Permalink
Merge pull request #47 from EvolEcolGroup/use_position
Browse files Browse the repository at this point in the history
Use position when merging
  • Loading branch information
dramanica authored Jun 24, 2024
2 parents f40fc98 + 3a168ed commit 26bd65d
Show file tree
Hide file tree
Showing 7 changed files with 75 additions and 9 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: tidypopgen
Title: Tidy Population Genetics
Version: 0.0.0.9014
Version: 0.0.0.9015
Authors@R:
c(person("Evie", "Carter", role = c("aut")),
person("Andrea", "Manica", email = "[email protected]",
Expand Down
17 changes: 15 additions & 2 deletions R/rbind_dry_run.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
#' be used as template to flip the ones in `target` and/or
#' swap their order as necessary.
#' @param target either a [`gen_tibble`] object, or the path to the PLINK bim file
#' @param use_position boolean of whether a combination of chromosome and position
#' should be used for matching SNPs. By default, `rbind` uses the locus name,
#' so this is set to FALSE. When using 'use_position=TRUE', make sure chromosomes
#' are coded in the same way in both `gen_tibbles` (a mix of e.g. 'chr1', '1' or
#' 'chromosome1' can be the reasons if an unexpectedly large number variants
#' are dropped when merging).
#' @param flip_strand boolean on whether strand flipping should be checked to
#' match the two datasets. Ambiguous SNPs (i.e. A/T and C/G)
#' will also be removed. It defaults to FALSE
Expand All @@ -27,7 +33,7 @@
#' to align the two datasets, `target` data.frame)
#' @export

rbind_dry_run <- function(ref, target, flip_strand = FALSE,
rbind_dry_run <- function(ref, target, use_position = FALSE, flip_strand = FALSE,
quiet = FALSE){
# create a data.frame with loci names, numeric_id, and alleles
# it requires a specific formatting to work
Expand All @@ -38,6 +44,13 @@ rbind_dry_run <- function(ref, target, flip_strand = FALSE,
# replace NA with "0" for missing allele to avoid subsetting headaches (NA does not play nice with subsetting)
ref_df$allele_alt[is.na(ref_df$allele_alt)]<-"0"
target_df$allele_alt[is.na(target_df$allele_alt)]<-"0"

# replace the names with a combination of chromosome and position
if (use_position){
target_df <- target_df %>% mutate(name_old = .data$name, name = paste(.data$chromosome,.data$position, sep="_") )
ref_df <- ref_df %>% mutate(name_old = .data$name, name = paste(.data$chromosome,.data$position, sep="_") )
}

# rename the alleles
ref_df <- ref_df %>% rename(allele_1 = "allele_alt", allele_2 = "allele_ref")
target_df <- target_df %>% rename(allele_1 = "allele_alt", allele_2 = "allele_ref")
Expand All @@ -61,7 +74,7 @@ rbind_dry_run_df <- function(ref_df, target_df, flip_strand, quiet){
# reorder target_sub to match ref_sub
target_sub <- target_sub[match(ref_sub$name, target_sub$name),]
# we now have two data.frames with the same loci and in the same order
stopifnot(all.equal(target_sub$name,ref_sub$name))
stopifnot(all.equal(target_sub$name,ref_sub$name))

# fix any missing alleles
target_sub$missing_allele <- resolve_missing_alleles(missing_table = target_sub, other_table = ref_sub)
Expand Down
23 changes: 20 additions & 3 deletions R/rbind_gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@
#' @param as_is boolean determining whether the loci should be left as they are
#' before merging. If FALSE (the defaults), `rbind` will attempt to subset and
#' swap alleles as needed.
#' @param use_position boolean of whether a combination of chromosome and position
#' should be used for matching SNPs. By default, `rbind` uses the locus name,
#' so this is set to FALSE. When using 'use_position=TRUE', make sure chromosomes
#' are coded in the same way in both `gen_tibbles` (a mix of e.g. 'chr1', '1' or
#' 'chromosome1' can be the reasons if an unexpectedly large number variants
#' are dropped when merging).
#' @param flip_strand boolean on whether strand flipping should be checked to
#' match the two datasets. If this is set to TRUE, ambiguous SNPs (i.e. A/T and C/G)
#' will also be removed. It defaults to FALSE
Expand All @@ -31,7 +37,7 @@
#' as its backing file for the FBM)
#' @returns a [`gen_tibble`] with the merged data.
#' @export
rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE,
rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE, use_position = FALSE,
quiet = FALSE, backingfile=NULL){
dots <- list(...)
if (length(dots)!=2){
Expand All @@ -57,10 +63,15 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE,
backingfile <- tempfile("gt_merged_",tmpdir = save_path, fileext = "")
}

# if we use position, we update the names of the loci
if (use_position){
show_loci(ref) <- show_loci(ref) %>% mutate(name_old = .data$name, name = paste(.data$chromosome,.data$position, sep="_") )
show_loci(target) <- show_loci(target) %>% mutate(name_old = .data$name, name = paste(.data$chromosome,.data$position, sep="_") )
}


report <- rbind_dry_run(ref = ref, target = target, flip_strand=flip_strand,
quiet = quiet)
quiet = quiet, use_position = use_position)
# now edit the gen_tibble objects
###########
# we start with the ref object
Expand All @@ -74,6 +85,9 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE,
new_ref_loci_tbl <- show_loci(ref)[order(report$ref$new_id,na.last=NA),]
# now we subset the SNP object
## in the snp object
if (nrow(new_ref_loci_tbl)==0){
stop("there are no loci in common between the two gen_tibbles")
}
ref_snp <- subset_bigSNP(attr(ref$genotypes,"bigsnp"),
loci_indices = new_ref_loci_tbl$big_index,
indiv_indices = vctrs::vec_data(ref$genotypes))
Expand Down Expand Up @@ -139,7 +153,10 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE,
vctrs::vec_data(ref$genotypes)
#and finally append the loci table
indivs_with_big_names <- c(names(ref$genotypes),names(target$genotypes))
new_ref_loci_tbl$big_index<-match(new_ref_loci_tbl$name,merged_snp$map$marker.ID) # TODO check that this is the correct order!!!!
#browser()
#new_ref_loci_tbl$big_index<-match(new_ref_loci_tbl$name,merged_snp$map$marker.ID) # TODO check that this is the correct order!!!!
new_ref_loci_tbl$big_index<-1:nrow(new_ref_loci_tbl) # by default, this should just be a subset in the same order as the reference

# TODO check that all individuals in tibble and bigsnp object are the same
merged_tbl$genotypes <- vctrs::new_vctr(match(indivs_with_big_names,merged_snp$fam$sample.ID), # TODO check that this is the correct order!!!!
bigsnp = merged_snp,
Expand Down
8 changes: 8 additions & 0 deletions man/rbind.gen_tbl.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 14 additions & 1 deletion man/rbind_dry_run.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 10 additions & 1 deletion tests/testthat/test_rbind_gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ raw_path_pop_a <- system.file("extdata/pop_a.bed", package = "tidypopgen")
bigsnp_path_a <- bigsnpr::snp_readBed(raw_path_pop_a, backingfile = tempfile("test_a_"))
pop_a_gt <- gen_tibble(bigsnp_path_a, quiet=TRUE)
# #create merge
merged_gen <- rbind.gen_tbl(pop_b_gt, pop_a_gt, flip_strand = TRUE,
merged_gen <- rbind.gen_tbl(pop_b_gt, pop_a_gt, flip_strand = TRUE,
quiet = TRUE,
backingfile = tempfile())

Expand Down Expand Up @@ -41,3 +41,12 @@ test_that("merge combines datasets correctly",{
testthat::expect_false("rs307354" %in% loci_names(merged_gen))

})

test_that("merge by position works correctly",{
pop_a_renamed_gt <- pop_a_gt
show_loci(pop_a_renamed_gt)$name <- paste("new_name",1:count_loci(pop_a_renamed_gt),sep="_")
# if we bind by name we should get zero (or an error)
expect_error(rbind(pop_b_gt, pop_a_renamed_gt, quiet=TRUE), "there are no loci in common")
pos_merge <- rbind(pop_b_gt, pop_a_renamed_gt, flip_strand = TRUE, use_position = TRUE)
expect_true(all.equal(show_genotypes(merged_gen),show_genotypes(pos_merge)))
})
8 changes: 7 additions & 1 deletion vignettes/a01_overview.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,13 @@ merged_gt %>% show_loci()
```

Again, note that the `big_index` values have changed compared to the original files,
as we generated a new FBM with the merged data.
as we generated a new FBM with the merged data.

By default, `rbind` using the loci names to match the two datasets. The option
'use_position = TRUE' forces matching by chromosome and position; note that a common
source of problems when merging is the coding of chromosomes (e.g. a dataset has
'chromosome1' and the other 'chr1'). If using 'use_position = TRUE', you should
also make sure that the two datasets were aligned to the same reference genome assembly.

# Imputation

Expand Down

0 comments on commit 26bd65d

Please sign in to comment.