Documentation #63

Merged 8 commits on Dec 2, 2024
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: tidypopgen
Title: Tidy Population Genetics
Version: 0.0.0.9019
Version: 0.0.0.9020
Authors@R:
c(person("Evie", "Carter", role = c("aut")),
person("Andrea", "Manica", email = "[email protected]",
1 change: 0 additions & 1 deletion R/filter_high_relatedness.R
@@ -72,7 +72,6 @@ filter_high_relatedness <-
# for pairs with higher than kings_threshold relatedness
# the individual with higher average relatedness is removed
for (i in 1:(var_num - 1)) {
#browser()
if (!any(matrix2[!is.na(matrix2)] > kings_threshold)) {
if (verbose) {
message("All correlations <=", kings_threshold, "\n")
1 change: 0 additions & 1 deletion R/gen_tibble_packedancestry.R
@@ -55,7 +55,6 @@ gen_tibble_packedancestry <- function(x, ...,
inds = indiv_table$id[i],
..., verbose = !quiet)
res$geno[is.na(res$geno)]<-3
#browser()
# now insert the genotypes in the FBM
file_backed_matrix[
i,
4 changes: 4 additions & 0 deletions R/gt_pca.R
@@ -14,5 +14,9 @@
#' NOTE: using gt_pca_autoSVD with a small dataset will likely cause an error,
#' see man page for details.
#'
#' NOTE: monomorphic markers must be removed before PCA is computed. The error
#' message 'Error: some variables have zero scaling; remove them before attempting
#' to scale.' indicates that monomorphic markers are present.
#'
#' @name gt_pca
NULL
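
For illustration, a minimal sketch of how monomorphic markers can be filtered out before running a PCA. The gen_tibble my_gt is hypothetical, and the filter assumes that select_loci_if() and loci_maf() behave as described in the tidypopgen documentation.

library(tidypopgen)
library(dplyr)   # for the %>% pipe
# keep only polymorphic loci (minor allele frequency above zero)
my_gt_poly <- my_gt %>% select_loci_if(loci_maf(genotypes) > 0)
# the PCA should then run without the 'zero scaling' error
my_pca <- my_gt_poly %>% gt_pca_partialSVD()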
13 changes: 11 additions & 2 deletions R/gt_pca_autoSVD.R
@@ -50,6 +50,10 @@
#'
#' Note: rather than accessing these elements directly, it is better to use
#' `tidy` and `augment`. See [`gt_pca_tidiers`].
#' Note: if you encounter 'Error in rollmean(): Parameter 'size' is too large.',
#' then `roll_size` exceeds the number of variants on at least one of your
#' chromosomes. If you have set `roll_size` yourself, reduce it; otherwise, pass
#' a smaller `roll_size` explicitly to avoid this error.
#'
#' @export

@@ -72,7 +76,13 @@
gt_set_imputed(x, set = TRUE)
on.exit(gt_set_imputed(x, set = FALSE))
}
#browser()

if(is.null(roll_size)){
message("If you encounter 'Error in rollmean(): Parameter 'size' is too large.'
roll_size exceeds the number of variants on at least one of your chromosomes.
Try reducing 'roll_size' to avoid this error.")

Check warning (Codecov / codecov/patch) on R/gt_pca_autoSVD.R#L81-L83: added lines #L81 - L83 were not covered by tests.
}

X <- attr(x$genotypes,"bigsnp") # convenient pointer
x_ind_col <- show_loci(x)$big_index
x_ind_row <- vctrs::vec_data(x$genotypes)
@@ -110,7 +120,6 @@
verbose = verbose)
# add names to the scores (to match them to data later)
rownames(this_svd$u)<-x$id
#browser()
this_svd$method <- "autoSVD"
this_svd$call <- match.call()
# subset the loci table to have only the snps of interest
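
For illustration, a minimal sketch of working around the rollmean 'size' error, assuming an already imputed gen_tibble my_gt with few variants on some chromosomes; roll_size = 7 simply mirrors the value used in the tests below and is not a general recommendation.

# a reduced roll_size avoids 'Error in rollmean(): Parameter 'size' is too large.'
my_pca <- my_gt %>% gt_pca_autoSVD(roll_size = 7, verbose = FALSE)
# roll_size = 0 switches off the rolling-window smoothing step entirely
# (see bigsnpr::snp_autoSVD, which gt_pca_autoSVD wraps)
my_pca_noroll <- my_gt %>% gt_pca_autoSVD(roll_size = 0)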
3 changes: 2 additions & 1 deletion R/pairwise_king.R
@@ -2,7 +2,8 @@
#'
#' This function computes the KING-robust estimator of kinship.
#'
#' Note that monomorphic sites are currently considered. What does PLINK do???
#' Note that monomorphic sites are currently included in the computation. Remove
#' monomorphic sites before running pairwise_king if this is a concern.
#' @param x a `gen_tibble` object.
#' @param as_matrix boolean, determining whether the results should be a square symmetrical matrix (TRUE),
#' or a tidied tibble (FALSE, the default)
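
For illustration, a minimal sketch of computing the KING-robust estimator on polymorphic loci only, assuming a hypothetical gen_tibble my_gt; the filtering step follows the monomorphic-sites note above and assumes the same select_loci_if()/loci_maf() behaviour.

king_mat <- my_gt %>%
  select_loci_if(loci_maf(genotypes) > 0) %>%  # drop monomorphic sites first
  pairwise_king(as_matrix = TRUE)              # square symmetric kinship matrix
# as_matrix = FALSE (the default) returns a tidied tibble of pairwise estimates instead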
4 changes: 0 additions & 4 deletions R/rbind_dry_run.R
@@ -54,7 +54,6 @@ rbind_dry_run <- function(ref, target, use_position = FALSE, flip_strand = FALSE
# rename the alleles
ref_df <- ref_df %>% rename(allele_1 = "allele_alt", allele_2 = "allele_ref")
target_df <- target_df %>% rename(allele_1 = "allele_alt", allele_2 = "allele_ref")
#browser()
rbind_dry_run_df(ref_df = ref_df,
target_df = target_df,
flip_strand = flip_strand,
@@ -115,8 +114,6 @@ rbind_dry_run_df <- function(ref_df, target_df, flip_strand, quiet){
to_flip <- to_flip & !ambiguous_sub
to_swap <- to_swap & !ambiguous_sub
}
# browser()

# now we create the two reporting data.frames
# note that they include all loci (including the ones we dropped because they
# did not exist in one of the datasets)
@@ -153,7 +150,6 @@ rbind_dry_run_df <- function(ref_df, target_df, flip_strand, quiet){


resolve_missing_alleles <- function(missing_table, other_table){
#browser()
missing_replacements <- rep(NA, nrow(missing_table))
for (i_row in which(missing_table$allele_1=="0")){
# get non-missing allele
1 change: 0 additions & 1 deletion R/rbind_gen_tibble.R
@@ -154,7 +154,6 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE, use_position
vctrs::vec_data(ref$genotypes)
#and finally append the loci table
indivs_with_big_names <- c(names(ref$genotypes),names(target$genotypes))
#browser()
#new_ref_loci_tbl$big_index<-match(new_ref_loci_tbl$name,merged_snp$map$marker.ID) # TODO check that this is the correct order!!!!
new_ref_loci_tbl$big_index<-1:nrow(new_ref_loci_tbl) # by default, this should just be a subset in the same order as the reference

4 changes: 4 additions & 0 deletions man/gt_pca.Rd

(generated file; diff not rendered)

4 changes: 4 additions & 0 deletions man/gt_pca_autoSVD.Rd

(generated file; diff not rendered)

3 changes: 2 additions & 1 deletion man/pairwise_king.Rd

(generated file; diff not rendered)

3 changes: 0 additions & 3 deletions tests/testthat/test_gen_tibble.R
@@ -731,9 +731,6 @@ test_that("gen_tibble family.ID from vcf",{
expect_true(all(fam$V1 == pop_b_vcf_gt$id))
expect_true(all(fam$V1 == fam$V2))
})



# Windows prevents the deletion of the backing file. It's something to do with the memory mapping
# library used by bigsnpr
# test_that("on error, we remove the old files",{
81 changes: 43 additions & 38 deletions tests/testthat/test_gt_pca.R
@@ -29,21 +29,16 @@ test_that("fit_gt_pca_and_predict",{

})

# TODO we should test gt_pca_autoSVD(), as the loci have to be subset within
# the object

test_that("gt_pca_autoSVD problem ",{
test_that("adjusting roll_size fixes gt_pca_autoSVD problem ",{

# @TODO remove this hack when the patch makes it through to cran
library(bigsnpr)


# Generate example_indiv_meta data frame
individual_ids <- paste0("indiv", 1:200)
populations <- sample(c("pop1", "pop2", "pop3", "pop4"), size = 200, replace = TRUE)
example_indiv_meta <- data.frame(id = individual_ids, population = populations)

# Generate example_genotypes matrix
# Generate example_genotypes matrix (no missingness)
values <- c(0, 1, 2)
genotype_matrix <- sample(values, size = 200 * 500, replace = TRUE)
example_genotypes <- matrix(genotype_matrix, nrow = 200, ncol = 500)
@@ -65,55 +60,43 @@
allele_alt = allele_alts
)


# create gen_tibble
test <- gen_tibble(x = example_genotypes, loci = example_loci, indiv_meta = example_indiv_meta, quiet = TRUE)

#works fine
#test_pca <- test %>% gt_pca_autoSVD()



# gen_tibble with no missingness runs
test_pca <- test %>% gt_pca_autoSVD()

# Now try with imputed data
library(bigsnpr)
bed_file <- system.file("extdata", "example-missing.bed", package = "bigsnpr")
missing_gt <- gen_tibble(bed_file, backingfile = tempfile("missing_"),quiet=TRUE)
expect_error(missing_gt %>% gt_pca_partialSVD(),
expect_error(missing_gt %>% gt_pca_autoSVD(),
"You can't have missing")
missing_gt <- gt_impute_simple(missing_gt, method = "mode")
expect_error(missing_pca <- missing_gt %>% gt_pca_autoSVD(verbose = FALSE), "Parameter 'size' is too large.")

#Issue for bigutilsr below:
#https://github.com/privefl/bigutilsr/issues/2

#Error in bigutilsr::rollmean(S.col[ind], roll.size) :
#Parameter 'size' is too large.

#the error persists when manually adjusting roll_size

expect_error(missing_pca <- missing_gt %>% gt_pca_autoSVD(roll_size = 10, verbose = FALSE), "Parameter 'size' is too large.")
expect_error(missing_pca <- missing_gt %>% gt_pca_autoSVD(roll_size = 1000, verbose = FALSE), "Parameter 'size' is too large.")

#until adjusting roll_size to 0
#missing_gt %>% gt_pca_autoSVD(roll_size = 0)
#or strangely up to 7?
#missing_gt %>% gt_pca_autoSVD(roll_size = 7)

# we encounter roll_size error
expect_error(missing_pca <- missing_gt %>% gt_pca_autoSVD(verbose = FALSE), "Parameter 'size' is too large.")

# adjusting roll_size fixes the error
test_pca_roll0 <- missing_gt %>% gt_pca_autoSVD(roll_size = 0)
expect_s3_class(test_pca_roll0, "gt_pca")
test_pca_roll7 <- missing_gt %>% gt_pca_autoSVD(roll_size = 7)
expect_s3_class(test_pca_roll7, "gt_pca")

#loci info is the same
#summary(test %>% show_loci())
#summary(missing_gt %>% show_loci())

#testing with the families dataset too
bed_file <- system.file("extdata/related", "families.bed", package = "tidypopgen")
missing_gt <- gen_tibble(bed_file, backingfile = tempfile("families"),quiet=TRUE, valid_alleles = c("2","1"))
expect_error( missing_gt %>% gt_pca_partialSVD(),
families <- gen_tibble(bed_file, backingfile = tempfile("families"),quiet=TRUE, valid_alleles = c("2","1"))
expect_error( families %>% gt_pca_autoSVD(),
"You can't have missing")
missing_gt <- gt_impute_simple(missing_gt, method = "mode")
families <- gt_impute_simple(families, method = "mode")

# the same error occurs
expect_error(missing_pca <- missing_gt %>% gt_pca_autoSVD(verbose = FALSE), "Parameter 'size' is too large.")
expect_error(missing_pca <- families %>% gt_pca_autoSVD(verbose = FALSE), "Parameter 'size' is too large.")

# adjusting roll_size fixes the error
test_pca_families_roll7 <- families %>% gt_pca_autoSVD(roll_size = 7)
expect_s3_class(test_pca_families_roll7, "gt_pca")
})


Expand All @@ -138,3 +121,25 @@ test_that("fit_gt_pca_and_predict_splitted_data",{
lsq_pred <- predict(modern_pca, new_data = ancient_gt, project_method = "least_squares")
expect_true(all(dim(lsq_pred)==c(20,2)))
})

test_that("PCA functions work with loci out of order",{
bed_file <- system.file("extdata", "example-missing.bed", package = "bigsnpr")
missing_gt <- gen_tibble(bed_file, backingfile = tempfile("missing_"),quiet=TRUE)
missing_gt <- gt_impute_simple(missing_gt, method = "mode")
missing_part_pca1 <- missing_gt %>% gt_pca_partialSVD()
missing_rand_pca1 <- missing_gt %>% gt_pca_randomSVD()

# now shuffle the loci
loci <- missing_gt %>% show_loci()
loci <- loci[sample(nrow(loci)),]
show_loci(missing_gt) <- loci

# Rerun PCA
missing_part_pca2 <- missing_gt %>% gt_pca_partialSVD()
missing_rand_pca2 <- missing_gt %>% gt_pca_randomSVD()

expect_equal(missing_part_pca1$loadings, missing_part_pca2$loadings)
expect_equal(missing_rand_pca1$loadings, missing_rand_pca2$loadings)
})

