Documentation #63

Merged 8 commits on Dec 2, 2024
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: tidypopgen
Title: Tidy Population Genetics
Version: 0.0.0.9019
Version: 0.0.0.9020
Authors@R:
c(person("Evie", "Carter", role = c("aut")),
person("Andrea", "Manica", email = "[email protected]",
1 change: 0 additions & 1 deletion R/filter_high_relatedness.R
@@ -72,7 +72,6 @@ filter_high_relatedness <-
# for pairs with higher than kings_threshold relatedness
# the individual with higher average relatedness is removed
for (i in 1:(var_num - 1)) {
#browser()
if (!any(matrix2[!is.na(matrix2)] > kings_threshold)) {
if (verbose) {
message("All correlations <=", kings_threshold, "\n")
1 change: 0 additions & 1 deletion R/gen_tibble_packedancestry.R
@@ -55,7 +55,6 @@ gen_tibble_packedancestry <- function(x, ...,
inds = indiv_table$id[i],
..., verbose = !quiet)
res$geno[is.na(res$geno)]<-3
#browser()
# now insert the genotypes in the FBM
file_backed_matrix[
i,
4 changes: 4 additions & 0 deletions R/gt_pca.R
@@ -14,5 +14,9 @@
#' NOTE: using gt_pca_autoSVD with a small dataset will likely cause an error,
#' see man page for details.
#'
#' NOTE: monomorphic markers must be removed before PCA is computed. The error
#' message 'Error: some variables have zero scaling; remove them before attempting
#' to scale.' indicates that monomorphic markers are present.
#'
#' @name gt_pca
NULL
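
For illustration, a minimal sketch of how monomorphic markers can be filtered out before running a PCA. The gen_tibble my_gt is hypothetical, and the filter assumes that select_loci_if() and loci_maf() behave as described in the tidypopgen documentation.

library(tidypopgen)
library(dplyr)   # for the %>% pipe
# keep only polymorphic loci (minor allele frequency above zero)
my_gt_poly <- my_gt %>% select_loci_if(loci_maf(genotypes) > 0)
# the PCA should then run without the 'zero scaling' error
my_pca <- my_gt_poly %>% gt_pca_partialSVD()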
13 changes: 11 additions & 2 deletions R/gt_pca_autoSVD.R
@@ -50,6 +50,10 @@
#'
#' Note: rather than accessing these elements directly, it is better to use
#' `tidy` and `augment`. See [`gt_pca_tidiers`].
#' Note: if you encounter 'Error in rollmean(): Parameter 'size' is too large.',
#' then `roll_size` exceeds the number of variants on at least one of your
#' chromosomes. If you have set `roll_size` yourself, reduce it; otherwise, pass
#' a smaller `roll_size` explicitly to avoid this error.
#'
#' @export

@@ -72,7 +76,13 @@
gt_set_imputed(x, set = TRUE)
on.exit(gt_set_imputed(x, set = FALSE))
}
#browser()

if(is.null(roll_size)){
message("If you encounter 'Error in rollmean(): Parameter 'size' is too large.'
roll_size exceeds the number of variants on at least one of your chromosomes.
Try reducing 'roll_size' to avoid this error.")

Check warning (Codecov / codecov/patch) on R/gt_pca_autoSVD.R#L81-L83: added lines #L81 - L83 were not covered by tests.
}

X <- attr(x$genotypes,"bigsnp") # convenient pointer
x_ind_col <- show_loci(x)$big_index
x_ind_row <- vctrs::vec_data(x$genotypes)
@@ -110,7 +120,6 @@
verbose = verbose)
# add names to the scores (to match them to data later)
rownames(this_svd$u)<-x$id
#browser()
this_svd$method <- "autoSVD"
this_svd$call <- match.call()
# subset the loci table to have only the snps of interest
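
For illustration, a minimal sketch of working around the rollmean 'size' error, assuming an already imputed gen_tibble my_gt with few variants on some chromosomes; roll_size = 7 simply mirrors the value used in the tests below and is not a general recommendation.

# a reduced roll_size avoids 'Error in rollmean(): Parameter 'size' is too large.'
my_pca <- my_gt %>% gt_pca_autoSVD(roll_size = 7, verbose = FALSE)
# roll_size = 0 switches off the rolling-window smoothing step entirely
# (see bigsnpr::snp_autoSVD, which gt_pca_autoSVD wraps)
my_pca_noroll <- my_gt %>% gt_pca_autoSVD(roll_size = 0)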
3 changes: 2 additions & 1 deletion R/pairwise_king.R
@@ -2,7 +2,8 @@
#'
#' This function computes the KING-robust estimator of kinship.
#'
#' Note that monomorphic sites are currently considered. What does PLINK do???
#' Note that monomorphic sites are currently included in the computation. Remove
#' monomorphic sites before running pairwise_king if this is a concern.
#' @param x a `gen_tibble` object.
#' @param as_matrix boolean, determining whether the results should be a square symmetrical matrix (TRUE),
#' or a tidied tibble (FALSE, the default)
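
For illustration, a minimal sketch of computing the KING-robust estimator on polymorphic loci only, assuming a hypothetical gen_tibble my_gt; the filtering step follows the monomorphic-sites note above and assumes the same select_loci_if()/loci_maf() behaviour.

king_mat <- my_gt %>%
  select_loci_if(loci_maf(genotypes) > 0) %>%  # drop monomorphic sites first
  pairwise_king(as_matrix = TRUE)              # square symmetric kinship matrix
# as_matrix = FALSE (the default) returns a tidied tibble of pairwise estimates instead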
4 changes: 0 additions & 4 deletions R/rbind_dry_run.R
@@ -54,7 +54,6 @@ rbind_dry_run <- function(ref, target, use_position = FALSE, flip_strand = FALSE
# rename the alleles
ref_df <- ref_df %>% rename(allele_1 = "allele_alt", allele_2 = "allele_ref")
target_df <- target_df %>% rename(allele_1 = "allele_alt", allele_2 = "allele_ref")
#browser()
rbind_dry_run_df(ref_df = ref_df,
target_df = target_df,
flip_strand = flip_strand,
@@ -115,8 +114,6 @@ rbind_dry_run_df <- function(ref_df, target_df, flip_strand, quiet){
to_flip <- to_flip & !ambiguous_sub
to_swap <- to_swap & !ambiguous_sub
}
# browser()

# now we create the two reporting data.frames
# note that they include all loci (including the ones we dropped because they
# did not exist in one of the datasets)
@@ -153,7 +150,6 @@ rbind_dry_run_df <- function(ref_df, target_df, flip_strand, quiet){


resolve_missing_alleles <- function(missing_table, other_table){
#browser()
missing_replacements <- rep(NA, nrow(missing_table))
for (i_row in which(missing_table$allele_1=="0")){
# get non-missing allele
1 change: 0 additions & 1 deletion R/rbind_gen_tibble.R
@@ -154,7 +154,6 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE, use_position
vctrs::vec_data(ref$genotypes)
#and finally append the loci table
indivs_with_big_names <- c(names(ref$genotypes),names(target$genotypes))
#browser()
#new_ref_loci_tbl$big_index<-match(new_ref_loci_tbl$name,merged_snp$map$marker.ID) # TODO check that this is the correct order!!!!
new_ref_loci_tbl$big_index<-1:nrow(new_ref_loci_tbl) # by default, this should just be a subset in the same order as the reference

4 changes: 4 additions & 0 deletions man/gt_pca.Rd

(generated file; diff not rendered)

4 changes: 4 additions & 0 deletions man/gt_pca_autoSVD.Rd

(generated file; diff not rendered)

3 changes: 2 additions & 1 deletion man/pairwise_king.Rd

(generated file; diff not rendered)

3 changes: 0 additions & 3 deletions tests/testthat/test_gen_tibble.R
@@ -731,9 +731,6 @@ test_that("gen_tibble family.ID from vcf",{
expect_true(all(fam$V1 == pop_b_vcf_gt$id))
expect_true(all(fam$V1 == fam$V2))
})



# Windows prevents the deletion of the backing file. It's something to do with the memory mapping
# library used by bigsnpr
# test_that("on error, we remove the old files",{
81 changes: 43 additions & 38 deletions tests/testthat/test_gt_pca.R
@@ -29,21 +29,16 @@ test_that("fit_gt_pca_and_predict",{

})

# TODO we should test gt_pca_autoSVD(), as the loci have to be subset within
# the object

test_that("gt_pca_autoSVD problem ",{
test_that("adjusting roll_size fixes gt_pca_autoSVD problem ",{

# @TODO remove this hack when the patch makes it through to cran
library(bigsnpr)


# Generate example_indiv_meta data frame
individual_ids <- paste0("indiv", 1:200)
populations <- sample(c("pop1", "pop2", "pop3", "pop4"), size = 200, replace = TRUE)
example_indiv_meta <- data.frame(id = individual_ids, population = populations)

# Generate example_genotypes matrix
# Generate example_genotypes matrix (no missingness)
values <- c(0, 1, 2)
genotype_matrix <- sample(values, size = 200 * 500, replace = TRUE)
example_genotypes <- matrix(genotype_matrix, nrow = 200, ncol = 500)
@@ -65,55 +60,43 @@
allele_alt = allele_alts
)


# create gen_tibble
test <- gen_tibble(x = example_genotypes, loci = example_loci, indiv_meta = example_indiv_meta, quiet = TRUE)

#works fine
#test_pca <- test %>% gt_pca_autoSVD()



# gen_tibble with no missingness runs
test_pca <- test %>% gt_pca_autoSVD()

# Now try with imputed data
library(bigsnpr)
bed_file <- system.file("extdata", "example-missing.bed", package = "bigsnpr")
missing_gt <- gen_tibble(bed_file, backingfile = tempfile("missing_"),quiet=TRUE)
expect_error(missing_gt %>% gt_pca_partialSVD(),
expect_error(missing_gt %>% gt_pca_autoSVD(),
"You can't have missing")
missing_gt <- gt_impute_simple(missing_gt, method = "mode")
expect_error(missing_pca <- missing_gt %>% gt_pca_autoSVD(verbose = FALSE), "Parameter 'size' is too large.")

#Issue for bigutilsr below:
#https://github.com/privefl/bigutilsr/issues/2

#Error in bigutilsr::rollmean(S.col[ind], roll.size) :
#Parameter 'size' is too large.

#the error persists when manually adjusting roll_size

expect_error(missing_pca <- missing_gt %>% gt_pca_autoSVD(roll_size = 10, verbose = FALSE), "Parameter 'size' is too large.")
expect_error(missing_pca <- missing_gt %>% gt_pca_autoSVD(roll_size = 1000, verbose = FALSE), "Parameter 'size' is too large.")

#until adjusting roll_size to 0
#missing_gt %>% gt_pca_autoSVD(roll_size = 0)
#or strangely up to 7?
#missing_gt %>% gt_pca_autoSVD(roll_size = 7)

# we encounter roll_size error
expect_error(missing_pca <- missing_gt %>% gt_pca_autoSVD(verbose = FALSE), "Parameter 'size' is too large.")

# adjusting roll_size fixes the error
test_pca_roll0 <- missing_gt %>% gt_pca_autoSVD(roll_size = 0)
expect_s3_class(test_pca_roll0, "gt_pca")
test_pca_roll7 <- missing_gt %>% gt_pca_autoSVD(roll_size = 7)
expect_s3_class(test_pca_roll7, "gt_pca")

#loci info is the same
#summary(test %>% show_loci())
#summary(missing_gt %>% show_loci())

#testing with the families dataset too
bed_file <- system.file("extdata/related", "families.bed", package = "tidypopgen")
missing_gt <- gen_tibble(bed_file, backingfile = tempfile("families"),quiet=TRUE, valid_alleles = c("2","1"))
expect_error( missing_gt %>% gt_pca_partialSVD(),
families <- gen_tibble(bed_file, backingfile = tempfile("families"),quiet=TRUE, valid_alleles = c("2","1"))
expect_error( families %>% gt_pca_autoSVD(),
"You can't have missing")
missing_gt <- gt_impute_simple(missing_gt, method = "mode")
families <- gt_impute_simple(families, method = "mode")

# the same error occurs
expect_error(missing_pca <- missing_gt %>% gt_pca_autoSVD(verbose = FALSE), "Parameter 'size' is too large.")
expect_error(missing_pca <- families %>% gt_pca_autoSVD(verbose = FALSE), "Parameter 'size' is too large.")

# adjusting roll_size fixes the error
test_pca_families_roll7 <- families %>% gt_pca_autoSVD(roll_size = 7)
expect_s3_class(test_pca_families_roll7, "gt_pca")
})


Expand All @@ -138,3 +121,25 @@ test_that("fit_gt_pca_and_predict_splitted_data",{
lsq_pred <- predict(modern_pca, new_data = ancient_gt, project_method = "least_squares")
expect_true(all(dim(lsq_pred)==c(20,2)))
})

test_that("PCA functions work with loci out of order",{
bed_file <- system.file("extdata", "example-missing.bed", package = "bigsnpr")
missing_gt <- gen_tibble(bed_file, backingfile = tempfile("missing_"),quiet=TRUE)
missing_gt <- gt_impute_simple(missing_gt, method = "mode")
missing_part_pca1 <- missing_gt %>% gt_pca_partialSVD()
missing_rand_pca1 <- missing_gt %>% gt_pca_randomSVD()

# now shuffle the loci
loci <- missing_gt %>% show_loci()
loci <- loci[sample(nrow(loci)),]
show_loci(missing_gt) <- loci

# Rerun PCA
missing_part_pca2 <- missing_gt %>% gt_pca_partialSVD()
missing_rand_pca2 <- missing_gt %>% gt_pca_randomSVD()

expect_equal(missing_part_pca1$loadings, missing_part_pca2$loadings)
expect_equal(missing_rand_pca1$loadings, missing_rand_pca2$loadings)
})

