From 3063418c1835662a9915db9965cb97fdec93cf6f Mon Sep 17 00:00:00 2001
From: Andrea Manica <am315@cam.ac.uk>
Date: Fri, 19 Apr 2024 10:36:23 +0100
Subject: [PATCH] fix subsetting for imputed data

Also avoid making changes to the original datasets when rbinding.
---
 R/subset_bigSNP.R                   | 47 +++++++++++++++++++----------
 tests/testthat/test_subset_bigSNP.R | 35 ++++++++++++++++++---
 2 files changed, 62 insertions(+), 20 deletions(-)

diff --git a/R/subset_bigSNP.R b/R/subset_bigSNP.R
index 46eeea3a..8b4ef186 100644
--- a/R/subset_bigSNP.R
+++ b/R/subset_bigSNP.R
@@ -19,36 +19,51 @@ subset_bigSNP <- function(X, indiv_indices=NULL, loci_indices=NULL, swap_indices
   if (is.null(loci_indices)){
     loci_indices <- bigstatsr::cols_along(X)
   }
-  new_map <- X$map
-  # first we swap the loci (if necessary)
+  new_map <- X$map[loci_indices, ]
+  new_map_ref <- new_map # a copy to use when swapping alleles
+
+  # save a copy of the FBM with only the columns and rows needed, in the order necessary
+  if (is.null(backingfile)){
+    backingfile <- tempfile()
+  }
+  new_geno <- bigstatsr::big_copy(X$genotypes, ind.row = indiv_indices,
+                                  ind.col = loci_indices, backingfile = backingfile)
+
+  # now we swap the loci (if necessary)
   if (!is.null(swap_indices)){
+    # convert swap indices to the new order
+    swap_indices<- match(swap_indices,loci_indices)
+    swap_indices <- swap_indices[!is.na(swap_indices)]
+
+    # store the old code
+    code_ref <- new_geno$code256
+    # set a code that allows us to see both raw data and imputed data
+    new_geno$code256 <- 1:256
     swap_locus <- function(x){
+      # note that we work on the raw bytes valus
       x_new <- x
-      x_new[x==2]<-0
-      x_new[x==0]<-2
-      x_new[is.na(x_new)]<-3
-      x_new
+      x_new[x==3]<-1
+      x_new[x==1]<-3
+      x_new[x==5]<-7
+      x_new[x==7]<-5
+      as.raw(x_new-1)
     }
     # change the genotypes
     for (i in swap_indices){
-      X$genotypes[,i]<-swap_locus(X$genotypes[,i])
+      new_geno[,i]<-swap_locus(new_geno[,i])
     }
     # now swap the alleles in the map
-    new_map$allele1[swap_indices]<-X$map$allele2[swap_indices]
-    new_map$allele2[swap_indices]<-X$map$allele1[swap_indices]
+    new_map$allele1[swap_indices]<-new_map_ref$allele2[swap_indices]
+    new_map$allele2[swap_indices]<-new_map_ref$allele1[swap_indices]
+    # now reset the code
+    new_geno$code256 <- code_ref
   }
 
-  if (is.null(backingfile)){
-    backingfile <- tempfile(tmpdir = getOption("FBM.dir"))
-  }
-  # save a copy of the FBM with only the columns and rows needed, in the order necessary
-  new_geno <- bigstatsr::big_copy(X$genotypes, ind.row = indiv_indices,
-                      ind.col = loci_indices, backingfile = backingfile)
   # now reassemble
   # Create the bigSNP object
   snp.list <- structure(list(genotypes = new_geno,
                              fam = X$fam[indiv_indices, ],
-                             map = new_map[loci_indices, ]),
+                             map = new_map),
                             class = "bigSNP")
 
   # save it and return the path of the saved object
diff --git a/tests/testthat/test_subset_bigSNP.R b/tests/testthat/test_subset_bigSNP.R
index 0c6b6b4b..c6afe262 100644
--- a/tests/testthat/test_subset_bigSNP.R
+++ b/tests/testthat/test_subset_bigSNP.R
@@ -27,8 +27,35 @@ test_that("subset_bigSNP correctly subsets the data", {
   expect_true(pop_a_snp$map %>% dplyr::filter(marker.ID==swapped_locus1) %>% pull(allele1)==
     new_snp$map %>% dplyr::filter(marker.ID==swapped_locus1) %>% pull(allele2))
 
+  ##############################################################################
+  ## check missing data
+  # add some missing data
+  pop_a_snp$genotypes[1,5]<-3
+  # check that this was successful
+  expect_true(is.na(pop_a_snp$genotypes[1,5]))
+  # now do the subset with swap
+  new_snp2 <- subset_bigSNP(pop_a_snp, indiv_indices = indiv_indices,
+                           loci_indices = loci_indices,
+                           swap_indices = swap_indices)
+  # check that we preserved the NA
+  expect_true(is.na(new_snp2$genotypes[2,5]))
+  # this should be identical to the original subset, except for the na value
+  new_geno_copy <- new_snp$genotypes[]
+  new_geno_copy[2,5]<-NA
+  expect_true(all.equal(new_snp2$genotypes[], new_geno_copy))
+  # now impute
+  pop_a_snp$genotypes <- bigsnpr::snp_fastImputeSimple(pop_a_snp$genotypes)
+  # check that it worked
+  expect_false(is.na(pop_a_snp$genotypes[1,5]))
+  # now subset like before
+  new_snp3 <- subset_bigSNP(pop_a_snp, indiv_indices = indiv_indices,
+                            loci_indices = loci_indices,
+                            swap_indices = swap_indices)
+  # check that we don't have an NA
+  expect_false(is.na(new_snp3$genotypes[2,5]))
+  # and that we flipped it correctly (it should be identical to the original subset)
+  expect_true(all.equal(new_snp3$genotypes[,5],new_snp$genotypes[,5]))
+  # but if we change the code, it is an imputed genotype
+  new_snp3$genotypes$code256<-bigsnpr::CODE_012
+  expect_true(is.na(new_snp3$genotypes[2,5]))
 })
-
-
-
-