From 3063418c1835662a9915db9965cb97fdec93cf6f Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Fri, 19 Apr 2024 10:36:23 +0100 Subject: [PATCH] fix subsetting for imputed data Also avoid making changes to the original datasets when rbinding. --- R/subset_bigSNP.R | 47 +++++++++++++++++++---------- tests/testthat/test_subset_bigSNP.R | 35 ++++++++++++++++++--- 2 files changed, 62 insertions(+), 20 deletions(-) diff --git a/R/subset_bigSNP.R b/R/subset_bigSNP.R index 46eeea3a..8b4ef186 100644 --- a/R/subset_bigSNP.R +++ b/R/subset_bigSNP.R @@ -19,36 +19,51 @@ subset_bigSNP <- function(X, indiv_indices=NULL, loci_indices=NULL, swap_indices if (is.null(loci_indices)){ loci_indices <- bigstatsr::cols_along(X) } - new_map <- X$map - # first we swap the loci (if necessary) + new_map <- X$map[loci_indices, ] + new_map_ref <- new_map # a copy to use when swapping alleles + + # save a copy of the FBM with only the columns and rows needed, in the order necessary + if (is.null(backingfile)){ + backingfile <- tempfile() + } + new_geno <- bigstatsr::big_copy(X$genotypes, ind.row = indiv_indices, + ind.col = loci_indices, backingfile = backingfile) + + # now we swap the loci (if necessary) if (!is.null(swap_indices)){ + # convert swap indices to the new order + swap_indices<- match(swap_indices,loci_indices) + swap_indices <- swap_indices[!is.na(swap_indices)] + + # store the old code + code_ref <- new_geno$code256 + # set a code that allows us to see both raw data and imputed data + new_geno$code256 <- 1:256 swap_locus <- function(x){ + # note that we work on the raw bytes valus x_new <- x - x_new[x==2]<-0 - x_new[x==0]<-2 - x_new[is.na(x_new)]<-3 - x_new + x_new[x==3]<-1 + x_new[x==1]<-3 + x_new[x==5]<-7 + x_new[x==7]<-5 + as.raw(x_new-1) } # change the genotypes for (i in swap_indices){ - X$genotypes[,i]<-swap_locus(X$genotypes[,i]) + new_geno[,i]<-swap_locus(new_geno[,i]) } # now swap the alleles in the map - new_map$allele1[swap_indices]<-X$map$allele2[swap_indices] - new_map$allele2[swap_indices]<-X$map$allele1[swap_indices] + new_map$allele1[swap_indices]<-new_map_ref$allele2[swap_indices] + new_map$allele2[swap_indices]<-new_map_ref$allele1[swap_indices] + # now reset the code + new_geno$code256 <- code_ref } - if (is.null(backingfile)){ - backingfile <- tempfile(tmpdir = getOption("FBM.dir")) - } - # save a copy of the FBM with only the columns and rows needed, in the order necessary - new_geno <- bigstatsr::big_copy(X$genotypes, ind.row = indiv_indices, - ind.col = loci_indices, backingfile = backingfile) # now reassemble # Create the bigSNP object snp.list <- structure(list(genotypes = new_geno, fam = X$fam[indiv_indices, ], - map = new_map[loci_indices, ]), + map = new_map), class = "bigSNP") # save it and return the path of the saved object diff --git a/tests/testthat/test_subset_bigSNP.R b/tests/testthat/test_subset_bigSNP.R index 0c6b6b4b..c6afe262 100644 --- a/tests/testthat/test_subset_bigSNP.R +++ b/tests/testthat/test_subset_bigSNP.R @@ -27,8 +27,35 @@ test_that("subset_bigSNP correctly subsets the data", { expect_true(pop_a_snp$map %>% dplyr::filter(marker.ID==swapped_locus1) %>% pull(allele1)== new_snp$map %>% dplyr::filter(marker.ID==swapped_locus1) %>% pull(allele2)) + ############################################################################## + ## check missing data + # add some missing data + pop_a_snp$genotypes[1,5]<-3 + # check that this was successful + expect_true(is.na(pop_a_snp$genotypes[1,5])) + # now do the subset with swap + new_snp2 <- subset_bigSNP(pop_a_snp, indiv_indices = indiv_indices, + loci_indices = loci_indices, + swap_indices = swap_indices) + # check that we preserved the NA + expect_true(is.na(new_snp2$genotypes[2,5])) + # this should be identical to the original subset, except for the na value + new_geno_copy <- new_snp$genotypes[] + new_geno_copy[2,5]<-NA + expect_true(all.equal(new_snp2$genotypes[], new_geno_copy)) + # now impute + pop_a_snp$genotypes <- bigsnpr::snp_fastImputeSimple(pop_a_snp$genotypes) + # check that it worked + expect_false(is.na(pop_a_snp$genotypes[1,5])) + # now subset like before + new_snp3 <- subset_bigSNP(pop_a_snp, indiv_indices = indiv_indices, + loci_indices = loci_indices, + swap_indices = swap_indices) + # check that we don't have an NA + expect_false(is.na(new_snp3$genotypes[2,5])) + # and that we flipped it correctly (it should be identical to the original subset) + expect_true(all.equal(new_snp3$genotypes[,5],new_snp$genotypes[,5])) + # but if we change the code, it is an imputed genotype + new_snp3$genotypes$code256<-bigsnpr::CODE_012 + expect_true(is.na(new_snp3$genotypes[2,5])) }) - - - -