Skip to content

Commit

Permalink
fix subsetting for imputed data
Browse files Browse the repository at this point in the history
Also avoid making changes to the original datasets when rbinding.
  • Loading branch information
dramanica committed Apr 19, 2024
1 parent b2430ba commit 3063418
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 20 deletions.
47 changes: 31 additions & 16 deletions R/subset_bigSNP.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,36 +19,51 @@ subset_bigSNP <- function(X, indiv_indices=NULL, loci_indices=NULL, swap_indices
if (is.null(loci_indices)){
loci_indices <- bigstatsr::cols_along(X)
}
new_map <- X$map
# first we swap the loci (if necessary)
new_map <- X$map[loci_indices, ]
new_map_ref <- new_map # a copy to use when swapping alleles

# save a copy of the FBM with only the columns and rows needed, in the order necessary
if (is.null(backingfile)){
backingfile <- tempfile()
}
new_geno <- bigstatsr::big_copy(X$genotypes, ind.row = indiv_indices,
ind.col = loci_indices, backingfile = backingfile)

# now we swap the loci (if necessary)
if (!is.null(swap_indices)){
# convert swap indices to the new order
swap_indices<- match(swap_indices,loci_indices)
swap_indices <- swap_indices[!is.na(swap_indices)]

# store the old code
code_ref <- new_geno$code256
# set a code that allows us to see both raw data and imputed data
new_geno$code256 <- 1:256
swap_locus <- function(x){
# note that we work on the raw bytes valus
x_new <- x
x_new[x==2]<-0
x_new[x==0]<-2
x_new[is.na(x_new)]<-3
x_new
x_new[x==3]<-1
x_new[x==1]<-3
x_new[x==5]<-7
x_new[x==7]<-5
as.raw(x_new-1)
}
# change the genotypes
for (i in swap_indices){
X$genotypes[,i]<-swap_locus(X$genotypes[,i])
new_geno[,i]<-swap_locus(new_geno[,i])
}
# now swap the alleles in the map
new_map$allele1[swap_indices]<-X$map$allele2[swap_indices]
new_map$allele2[swap_indices]<-X$map$allele1[swap_indices]
new_map$allele1[swap_indices]<-new_map_ref$allele2[swap_indices]
new_map$allele2[swap_indices]<-new_map_ref$allele1[swap_indices]
# now reset the code
new_geno$code256 <- code_ref
}

if (is.null(backingfile)){
backingfile <- tempfile(tmpdir = getOption("FBM.dir"))
}
# save a copy of the FBM with only the columns and rows needed, in the order necessary
new_geno <- bigstatsr::big_copy(X$genotypes, ind.row = indiv_indices,
ind.col = loci_indices, backingfile = backingfile)
# now reassemble
# Create the bigSNP object
snp.list <- structure(list(genotypes = new_geno,
fam = X$fam[indiv_indices, ],
map = new_map[loci_indices, ]),
map = new_map),
class = "bigSNP")

# save it and return the path of the saved object
Expand Down
35 changes: 31 additions & 4 deletions tests/testthat/test_subset_bigSNP.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,35 @@ test_that("subset_bigSNP correctly subsets the data", {
expect_true(pop_a_snp$map %>% dplyr::filter(marker.ID==swapped_locus1) %>% pull(allele1)==
new_snp$map %>% dplyr::filter(marker.ID==swapped_locus1) %>% pull(allele2))

##############################################################################
## check missing data
# add some missing data
pop_a_snp$genotypes[1,5]<-3
# check that this was successful
expect_true(is.na(pop_a_snp$genotypes[1,5]))
# now do the subset with swap
new_snp2 <- subset_bigSNP(pop_a_snp, indiv_indices = indiv_indices,
loci_indices = loci_indices,
swap_indices = swap_indices)
# check that we preserved the NA
expect_true(is.na(new_snp2$genotypes[2,5]))
# this should be identical to the original subset, except for the na value
new_geno_copy <- new_snp$genotypes[]
new_geno_copy[2,5]<-NA
expect_true(all.equal(new_snp2$genotypes[], new_geno_copy))
# now impute
pop_a_snp$genotypes <- bigsnpr::snp_fastImputeSimple(pop_a_snp$genotypes)
# check that it worked
expect_false(is.na(pop_a_snp$genotypes[1,5]))
# now subset like before
new_snp3 <- subset_bigSNP(pop_a_snp, indiv_indices = indiv_indices,
loci_indices = loci_indices,
swap_indices = swap_indices)
# check that we don't have an NA
expect_false(is.na(new_snp3$genotypes[2,5]))
# and that we flipped it correctly (it should be identical to the original subset)
expect_true(all.equal(new_snp3$genotypes[,5],new_snp$genotypes[,5]))
# but if we change the code, it is an imputed genotype
new_snp3$genotypes$code256<-bigsnpr::CODE_012
expect_true(is.na(new_snp3$genotypes[2,5]))
})




0 comments on commit 3063418

Please sign in to comment.