diff --git a/R/filter_high_relatedness.R b/R/filter_high_relatedness.R index f91a2893..b3825f70 100644 --- a/R/filter_high_relatedness.R +++ b/R/filter_high_relatedness.R @@ -4,7 +4,6 @@ #' coefficients and returns the maximum set of individuals that contains no #' relationships above the given threshold. #' -#' TODO this function needs a test #' #' @param matrix a square symmetric matrix of individuals containing relationship coefficients #' @param .x a [`gen_tibble`] object diff --git a/R/gen_tibble.R b/R/gen_tibble.R index cfff22a2..90e07577 100644 --- a/R/gen_tibble.R +++ b/R/gen_tibble.R @@ -196,7 +196,12 @@ gen_tibble.matrix <- function(x, indiv_meta, loci, ..., stop ("'0' can not be a valid allele (it is the default missing allele value!)") } - # TODO check object types + if (!inherits(loci, "data.frame") || inherits(x, "tbl")){ + stop("loci must be one of data.frame or tbl") + } + if (!inherits(indiv_meta, "data.frame") || inherits(x, "tbl") || is.list(x)){ + stop("indiv_meta must be one of data.frame, tbl, or list") + } if (!all(c("id", "population") %in% names(indiv_meta))){ stop("ind_meta does not include the compulsory columns 'id' and 'population") } diff --git a/R/gt_has_imputed.R b/R/gt_has_imputed.R index bfb45d97..d4b98bf4 100644 --- a/R/gt_has_imputed.R +++ b/R/gt_has_imputed.R @@ -28,7 +28,7 @@ gt_uses_imputed <- function (x){ x <- x$genotypes } if (!gt_has_imputed(x)){ - stop("this dataset does not have any imputated values to use!") + stop("this dataset does not have any imputed values to use!") } if (identical(attr(x,"bigsnp")$genotypes$code256, bigsnpr::CODE_IMPUTE_PRED) | identical(attr(x,"bigsnp")$genotypes$code256, bigsnpr::CODE_DOSAGE)){ diff --git a/R/gt_impute_simple.R b/R/gt_impute_simple.R index 097d3577..1fddd504 100644 --- a/R/gt_impute_simple.R +++ b/R/gt_impute_simple.R @@ -7,7 +7,7 @@ #' #' @param x a [gen_tibble] with missing data #' @param method one of -#' - 'median': the most frequent genotype +#' - 'mode': the most frequent genotype #' - 'mean0': the mean rounded to the nearest integer #' - 'mean2': the mean rounded to 2 decimal places #' - 'random': randomly sample a genotype based on the observed allele frequencies diff --git a/R/gt_pca_tidiers.R b/R/gt_pca_tidiers.R index c9757217..6430c70b 100644 --- a/R/gt_pca_tidiers.R +++ b/R/gt_pca_tidiers.R @@ -172,7 +172,7 @@ augment_loci.gt_pca <- function(x, data = NULL, k= NULL, ...) { ret <- if (!missing(data) && !is.null(data)) { #check that names of the two columns are in sync # @TODO reinstate this check once we have rownames in the pca object for loadings - if (!all.equal(loci_names(data), rownames(as.data.frame(x$v)))){ + if (!identical(loci_names(data), rownames(as.data.frame(x$v)))){ stop("the loci names in 'data' do not correspond to the loci in the pca object 'x'") } show_loci(data) <- show_loci(data) %>% tibble::add_column(loadings) diff --git a/R/loci_ld_clump.R b/R/loci_ld_clump.R index 822c528d..658ac855 100644 --- a/R/loci_ld_clump.R +++ b/R/loci_ld_clump.R @@ -63,13 +63,18 @@ loci_ld_clump.vctrs_bigSNP <- function(.x, ...) { rlang::check_dots_empty() - + stopifnot_diploid(.x) + if (gt_has_imputed(.x) && gt_uses_imputed(.x)==FALSE){ #but not uses_imputed gt_set_imputed(.x, set = TRUE) on.exit(gt_set_imputed(.x, set = FALSE)) } - stopifnot_diploid(.x) + if(!identical(show_loci(.x),.x %>% show_loci() %>% arrange(show_loci(.x)$chromosome,show_loci(.x)$position))){ + stop("Your loci are not sorted, try using: show_loci(.data) <- .data %>% show_loci() %>% arrange(chromosome,position)") + + } + # get the FBM geno_fbm <- attr(.x,"bigsnp")$genotypes # rows (individuals) that we want to use diff --git a/R/loci_transitions.R b/R/loci_transitions.R index 80d9d9d8..df87bf2d 100644 --- a/R/loci_transitions.R +++ b/R/loci_transitions.R @@ -18,6 +18,7 @@ loci_transitions <- function(.x, ...) { loci_transitions.tbl_df <- function(.x, ...) { #TODO this is a hack to deal with the class being dropped when going through group_map stopifnot_gen_tibble(.x) + check_allele_alphabet(.x$genotypes) loci_transitions(.x$genotypes, ...) } diff --git a/R/loci_transversions.R b/R/loci_transversions.R index 36472c8d..000adbd0 100644 --- a/R/loci_transversions.R +++ b/R/loci_transversions.R @@ -18,6 +18,7 @@ loci_transversions <- function(.x, ...) { loci_transversions.tbl_df <- function(.x, ...) { #TODO this is a hack to deal with the class being dropped when going through group_map stopifnot_gen_tibble(.x) + check_allele_alphabet(.x$genotypes) loci_transversions(.x$genotypes, ...) } diff --git a/R/qc_report_indiv.R b/R/qc_report_indiv.R index 66f37ea5..8440bc34 100644 --- a/R/qc_report_indiv.R +++ b/R/qc_report_indiv.R @@ -46,7 +46,7 @@ qc_report_indiv <- function(.x, kings_threshold = NULL, ...){ #' ready plots. #' #' @param object an object of class `qc_report_indiv` -#' @param type the type of plot (`scatter`) +#' @param type the type of plot (`scatter`,`relatedness`) #' @param miss_threshold a threshold for the accepted rate of missingness within #' individuals #' @param kings_threshold an optional numeric, a threshold of relatedness for the sample @@ -96,7 +96,6 @@ autoplot_qc_report_indiv <- function(object, miss_threshold = miss_threshold){ autoplot_qc_report_indiv_king <- function(object, kings_threshold = kings_threshold){ - browser() king <- as.data.frame(attr(object$to_keep, "king")) num_samples <- nrow(king) king$row <- colnames(king) diff --git a/R/select_loci_if.R b/R/select_loci_if.R index bf99873b..e24b5e53 100644 --- a/R/select_loci_if.R +++ b/R/select_loci_if.R @@ -20,7 +20,7 @@ select_loci_if <-function(.data, .sel_logical){ # and now evaluate it, allowing it to see the data loci_sel <- rlang::eval_tidy(sel_defused,data=.data) if (!inherits(loci_sel,"logical")){ - stop(".sel_logical should be a logical (boolean) vector") + stop(".sel_logical should be a logical boolean vector") } if (length(loci_sel) != ncol(show_genotypes(.data$genotypes))){ stop(".sel_logical should be the same length as the number of loci") diff --git a/inst/extdata/pop_b.ped b/inst/extdata/pop_b.ped new file mode 100644 index 00000000..71093746 --- /dev/null +++ b/inst/extdata/pop_b.ped @@ -0,0 +1,3 @@ +pop_b SA073 0 0 2 -9 A A G G A A A A G G C C G G G G T T G G G G C C C C A A C T A G A A +pop_b SA1021 0 0 2 -9 G A A G A A G A G G C C T G A G 0 0 G G G G C C C C G A T T G G A A +pop_b SA1008 0 0 1 -9 A A A G A A G A G G C C T G G G T T G G G G C C C C A A C T G G A A diff --git a/inst/extdata/related/families_hwe.hwe b/inst/extdata/related/families_hwe.hwe new file mode 100644 index 00000000..85ff2289 --- /dev/null +++ b/inst/extdata/related/families_hwe.hwe @@ -0,0 +1,11 @@ + CHR SNP TEST A1 A2 GENO O(HET) E(HET) P + 1 1 ALL(NP) 2 1 0/4/6 0.4 0.32 1 + 1 2 ALL(NP) 2 1 2/4/6 0.3333 0.4444 0.5176 + 1 3 ALL(NP) 2 1 0/5/7 0.4167 0.3299 1 + 1 4 ALL(NP) 1 2 2/4/5 0.3636 0.4628 0.5377 + 1 5 ALL(NP) 2 1 0/7/5 0.5833 0.4132 0.4874 + 1 6 ALL(NP) 2 1 1/6/3 0.6 0.48 1 + 1 7 ALL(NP) 2 1 1/5/5 0.4545 0.4339 1 + 1 8 ALL(NP) 2 1 0/8/4 0.6667 0.4444 0.216 + 1 9 ALL(NP) 2 1 0/6/5 0.5455 0.3967 0.5046 + 1 10 ALL(NP) 1 2 2/7/3 0.5833 0.4965 1 diff --git a/inst/extdata/related/families_hwe.nosex b/inst/extdata/related/families_hwe.nosex new file mode 100644 index 00000000..22ee7fca --- /dev/null +++ b/inst/extdata/related/families_hwe.nosex @@ -0,0 +1,12 @@ +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +10 10 +11 11 +12 12 diff --git a/inst/extdata/related/families_hwe_midp.hwe b/inst/extdata/related/families_hwe_midp.hwe new file mode 100644 index 00000000..fa72488f --- /dev/null +++ b/inst/extdata/related/families_hwe_midp.hwe @@ -0,0 +1,11 @@ + CHR SNP TEST A1 A2 GENO O(HET) E(HET) P + 1 1 ALL(NP) 2 1 0/4/6 0.4 0.32 0.6533 + 1 2 ALL(NP) 2 1 2/4/6 0.3333 0.4444 0.3668 + 1 3 ALL(NP) 2 1 0/5/7 0.4167 0.3299 0.7019 + 1 4 ALL(NP) 1 2 2/4/5 0.3636 0.4628 0.3643 + 1 5 ALL(NP) 2 1 0/7/5 0.5833 0.4132 0.341 + 1 6 ALL(NP) 2 1 1/6/3 0.6 0.48 0.7866 + 1 7 ALL(NP) 2 1 1/5/5 0.4545 0.4339 0.7399 + 1 8 ALL(NP) 2 1 0/8/4 0.6667 0.4444 0.1299 + 1 9 ALL(NP) 2 1 0/6/5 0.5455 0.3967 0.3065 + 1 10 ALL(NP) 1 2 2/7/3 0.5833 0.4965 0.7969 diff --git a/inst/extdata/related/families_hwe_midp.nosex b/inst/extdata/related/families_hwe_midp.nosex new file mode 100644 index 00000000..22ee7fca --- /dev/null +++ b/inst/extdata/related/families_hwe_midp.nosex @@ -0,0 +1,12 @@ +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +10 10 +11 11 +12 12 diff --git a/man/autoplot.qc_report_indiv.Rd b/man/autoplot.qc_report_indiv.Rd index e87735e8..47bf6a9e 100644 --- a/man/autoplot.qc_report_indiv.Rd +++ b/man/autoplot.qc_report_indiv.Rd @@ -15,7 +15,7 @@ \arguments{ \item{object}{an object of class \code{qc_report_indiv}} -\item{type}{the type of plot (\code{scatter})} +\item{type}{the type of plot (\code{scatter},\code{relatedness})} \item{miss_threshold}{a threshold for the accepted rate of missingness within individuals} diff --git a/man/filter_high_relatedness.Rd b/man/filter_high_relatedness.Rd index c9e29afd..a9ee8121 100644 --- a/man/filter_high_relatedness.Rd +++ b/man/filter_high_relatedness.Rd @@ -30,6 +30,3 @@ This function takes a matrix of x by y individuals containing relatedness coefficients and returns the maximum set of individuals that contains no relationships above the given threshold. } -\details{ -TODO this function needs a test -} diff --git a/man/gt_impute_simple.Rd b/man/gt_impute_simple.Rd index 832d5687..fba31f5d 100644 --- a/man/gt_impute_simple.Rd +++ b/man/gt_impute_simple.Rd @@ -15,7 +15,7 @@ gt_impute_simple( \item{method}{one of \itemize{ -\item 'median': the most frequent genotype +\item 'mode': the most frequent genotype \item 'mean0': the mean rounded to the nearest integer \item 'mean2': the mean rounded to 2 decimal places \item 'random': randomly sample a genotype based on the observed allele frequencies diff --git a/tests/testthat/test_augment_loci.R b/tests/testthat/test_augment_loci.R new file mode 100644 index 00000000..297672f0 --- /dev/null +++ b/tests/testthat/test_augment_loci.R @@ -0,0 +1,16 @@ +test_that("augment_loci adds to loci_table",{ + bed_file <- system.file("extdata", "example-missing.bed", package = "bigsnpr") + missing_gt <- gen_tibble(bed_file, backingfile = tempfile("missing_"),quiet=TRUE) + missing_gt <- gt_impute_simple(missing_gt) + missing_pca <- missing_gt %>% gt_pca_partialSVD() + + #Add loadings to loci table + missing_pca_load <- augment_loci(missing_pca, data = missing_gt) + expect_true(all(colnames(missing_pca_load) == c(colnames(show_loci(missing_gt)),paste0(".loadingPC", c(1:10))))) + + #Try assigning loadings to object with fewer loci + missing_gt_rm <- missing_gt %>% select_loci(c(1:400)) + expect_error(augment_loci(missing_pca, data = missing_gt_rm),"the loci names in 'data' do not correspond to the loci in the pca object ") + + +}) diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R index a0d3750a..2ce54980 100644 --- a/tests/testthat/test_gen_tibble.R +++ b/tests/testthat/test_gen_tibble.R @@ -32,16 +32,19 @@ test_that("create gen_tibble from dfs",{ }) -test_genotypes_c <- rbind(c("1","1","0","1","1","0"), - c("2","1","0","0","0","0"), - c("2","2","0","0","1","1")) - -test_that("gen_tibble does not accept character matrix",{ - expect_error(test_dfs_gt <- gen_tibble(test_genotypes_c, indiv_meta = test_indiv_meta, - loci = test_loci, quiet = TRUE),"'x' is not a matrix of integers") +# now create it directly from the dfs +test_that("create gen_tibble from dfs",{ + test_dfs_gt <- gen_tibble(test_genotypes, indiv_meta = test_indiv_meta, + loci = test_loci, quiet = TRUE) + # because of the different backing file info, we cannot use identical on the whole object + expect_true(identical(show_genotypes(test_gt), show_genotypes(test_dfs_gt))) + expect_true(identical(show_loci(test_gt), show_loci(test_dfs_gt))) + expect_true(identical(test_gt %>% select(-genotypes), + test_dfs_gt %>% select(-genotypes))) }) + test_that("gen_tibble catches invalid alleles",{ test_loci_wrong <- test_loci test_loci_wrong$allele_alt[1] <- "N" @@ -70,6 +73,92 @@ test_that("gen_tibble catches invalid alleles",{ }) + +test_that("if order of loci is changed, order of genotypes also changes",{ + + pop_b <- gen_tibble(system.file("extdata/pop_b.bed", package="tidypopgen"),backingfile = tempfile(), quiet = TRUE) + #original genotypes + pop_b_gen <- show_genotypes(pop_b) + + #now scramble the loci + set.seed(123) + random_order <- sample(1:17) + show_loci(pop_b) <- pop_b %>% select_loci(all_of(random_order)) %>% show_loci() + + #reorder the original genotypes according to 'random_order' + pop_b_gen_reordered <- pop_b_gen[,random_order] + + #check that genotypes are now reordered according to random order + expect_equal(pop_b_gen_reordered, show_genotypes(pop_b)) + + +}) + +test_that("gen_tibble does not accept character matrix",{ + test_genotypes_c <- rbind(c("1","1","0","1","1","0"), + c("2","1","0","0","0","0"), + c("2","2","0","0","1","1")) + expect_error(test_dfs_gt <- gen_tibble(test_genotypes_c, indiv_meta = test_indiv_meta, + loci = test_loci, quiet = TRUE),"'x' is not a matrix of integers") +}) + +test_that("gen_tibble wrong filetype error",{ + expect_error(test_dfs_gt <- gen_tibble(system.file("extdata/related/test_king.kin0", package = "tidypopgen")), + "file_path should be pointing ") +}) + +test_that("gen_tibble loci is dataframe or tbl",{ + + test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=paste0("chr",c(1,1,1,1,2,2)), + position=as.integer(c(3,5,65,343,23,456)), + genetic_dist = as.integer(rep(0,6)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) + wrong_loci_matrix <- as.matrix(test_loci) + + expect_error(test_dfs_gt <- gen_tibble(test_genotypes, indiv_meta = test_indiv_meta, + loci = wrong_loci_matrix, quiet = TRUE),"loci must be one of data.frame or tbl") +}) + +test_that("gen_tibble required id and population",{ + wrong_indiv_meta <- data.frame (x =c("a","b","c"), + y = c("pop1","pop1","pop2")) + expect_error(test_dfs_gt <- gen_tibble(test_genotypes, indiv_meta = wrong_indiv_meta, + loci = test_loci, quiet = TRUE),"ind_meta does not include the compulsory columns") +}) + +test_that("gen_tibble indiv_meta is list, dataframe, or tbl",{ + wrong_indiv_meta <- data.frame (id=c("a","b","c"), + population = c("pop1","pop1","pop2")) + wrong_indiv_meta_matrix <- as.matrix(wrong_indiv_meta) + + expect_error(test_dfs_gt <- gen_tibble(test_genotypes, indiv_meta = wrong_indiv_meta_matrix, + loci = test_loci, quiet = TRUE),"indiv_meta must be one of data.frame, tbl, or list") +}) + +test_that("gen_tibble identifies wrong dimensions in genotypes",{ + wrong_genotypes <- rbind(c(1,1,0,1,1,0), + c(2,1,0,0,0,0)) + expect_error(test_dfs_gt <- gen_tibble(wrong_genotypes, indiv_meta = test_indiv_meta, + loci = test_loci, quiet = TRUE), + "there is a mismatch between the number of loci in the genotype table x and in the loci table") + +}) + +test_that("gen_tibble identifies wrong loci table columns",{ + wrong_loci <- data.frame(a=paste0("rs",1:6), + b=paste0("chr",c(1,1,1,1,2,2)), + c=as.integer(c(3,5,65,343,23,456)), + d = as.integer(rep(0,6)), + e = c("A","T","C","G","C","T"), + f = c("T","C", NA,"C","G","A")) + expect_error(test_dfs_gt <- gen_tibble(test_genotypes, indiv_meta = test_indiv_meta, + loci = wrong_loci, quiet = TRUE), + "loci does not include the compulsory columns") +}) + + test_that("gen_tibble from files",{ bed_path <- system.file("extdata/pop_a.bed", package = "tidypopgen") pop_a_gt <- gen_tibble(bed_path, quiet=TRUE, backingfile = tempfile()) diff --git a/tests/testthat/test_gt_impute_simple.R b/tests/testthat/test_gt_impute_simple.R index 65899882..ec469829 100644 --- a/tests/testthat/test_gt_impute_simple.R +++ b/tests/testthat/test_gt_impute_simple.R @@ -6,7 +6,7 @@ test_that("impute and use the imputation",{ "You can't have missing values") expect_false(gt_has_imputed(missing_gt)) expect_error(gt_uses_imputed(missing_gt), - "this dataset does not have any imputated") + "this dataset does not have any imputed") # now impute missing_gt <- gt_impute_simple(missing_gt) # we have imputed @@ -19,3 +19,98 @@ test_that("impute and use the imputation",{ expect_error(gt_set_imputed(missing_gt), "set should be either") }) + + +test_that("gt_impute imputes properly",{ + test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f"), + population = c("pop1","pop1","pop2","pop1","pop1","pop2")) + + test_genotypes <- matrix(c( + 0, 2, 1, 1, 0, # + 0, 0, 2, 0, 1, # + 2, 0, 0, 1, 1, # + 1, 1, 2, 2, 2, # + 0, 0, 2, 1, 1, # + NA,NA,NA,NA,NA + ), nrow = 6, byrow = TRUE) + + + + test_loci <- data.frame(name=paste0("rs",1:5), + chromosome=c(1,1,1,2,2), + position=c(3,65,343,23,456), + genetic_dist = as.integer(rep(0,5)), + allele_ref = c("A","T","C","G","C"), + allele_alt = c("T","C", NA,"C","G")) + + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE) + + #test errors on non-imputed set + expect_error(gt_uses_imputed(test_gt),"this dataset does not have any imputed values") + expect_error(gt_set_imputed(test_gt, TRUE),"this dataset does not have imputed values") + + + #impute method = 'mode' + imputed_gt_mode <- gt_impute_simple(test_gt, method = "mode") + + #check imputation + expect_false(gt_has_imputed(test_gt)) + expect_true(gt_has_imputed(imputed_gt_mode)) + + #set imputation + gt_set_imputed(imputed_gt_mode, TRUE) + expect_false(any(is.na(show_genotypes(imputed_gt_mode)))) + + #test error trying to impute an already imputed set + #expect_error(gt_impute_simple(imputed_gt_mode),"object x is already imputed") + + #Check imputed 'mode' method + mode_function <- function(x){ + unique_x <- unique(x) + return(unique_x[which.max(tabulate(match(x, unique_x)))]) + + } + + modes <- apply(test_genotypes, 2, mode_function) + expect_true(all(show_genotypes(imputed_gt_mode)[6,] == modes)) + + + #impute method = 'mean0' + imputed_gt_mean0 <- gt_impute_simple(test_gt, method = "mean0") + + #set imputation + gt_set_imputed(imputed_gt_mean0, TRUE) + expect_false(any(is.na(show_genotypes(imputed_gt_mean0)))) + + #check imputed 'mean0' method + means <- round(colMeans(test_genotypes, na.rm = TRUE), digit = 0) + expect_true(all(means == show_genotypes(imputed_gt_mean0)[6,])) + + + #impute method = 'random' + imputed_gt_random <- gt_impute_simple(test_gt, method = "random") + + #set imputation + gt_set_imputed(imputed_gt_random, TRUE) + expect_false(any(is.na(show_genotypes(imputed_gt_random)))) + + # Problem with method 'mean2'? + + + #impute method = 'mean2' + imputed_gt_mean2 <- gt_impute_simple(test_gt, method = "mean2") + + #set imputation + gt_set_imputed(imputed_gt_mean2, TRUE) + #expect_false(any(is.na(show_genotypes(imputed_gt_mean2)))) #? + + #check imputed 'mean2' method + means2 <- round(colMeans(test_genotypes, na.rm = TRUE), digit = 2) + #expect_true(all(means2 == show_genotypes(imputed_gt_mean2)[6,])) #? + + + +}) + + + diff --git a/tests/testthat/test_indiv_het_obs.R b/tests/testthat/test_indiv_het_obs.R index 1219f853..190ff05e 100644 --- a/tests/testthat/test_indiv_het_obs.R +++ b/tests/testthat/test_indiv_het_obs.R @@ -22,3 +22,30 @@ test_that("indiv_het_obs computes correctly",{ expect_true(all(indiv_het_obs(test_gt)== rowMeans(test_genotypes==1,na.rm=TRUE))) }) + +test_that("indiv_het_obs returns 0's when all genotypes are homozygous", { + + test_indiv_meta <- data.frame (id=c("a","b","c"), + population = c("pop1","pop1","pop2")) + test_genotypes_homozyg <- rbind(c(2,2,0,0,2,0), + c(2,2,0,0,2,0), + c(2,2,0,0,2,0)) + test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=c(1,1,1,1,2,2), + position=c(3,5,65,343,23,456), + genetic_dist = as.integer(rep(0,6)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) + + test_gt_homozyg <- gen_tibble(x = test_genotypes_homozyg, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE) + + # feeding the list of SNPbin directly + expect_true(all(indiv_het_obs(test_gt_homozyg$genotypes)== + rowMeans(test_genotypes_homozyg==1,na.rm=TRUE))) + + # passing tibble + expect_true(all(indiv_het_obs(test_gt_homozyg)== + rowMeans(test_genotypes_homozyg==1,na.rm=TRUE))) + +}) + diff --git a/tests/testthat/test_indiv_missingness.R b/tests/testthat/test_indiv_missingness.R index f9b69a3c..249b4e27 100644 --- a/tests/testthat/test_indiv_missingness.R +++ b/tests/testthat/test_indiv_missingness.R @@ -41,4 +41,24 @@ test_that("as_counts switch",{ expect_false(identical(qc_rates,qc_counts)) }) +test_genotypes_moreNA <- rbind(c(1,1,0,1,1,2), + c(2,NA,NA,NA,NA,NA), + c(2,2,0,0,1,NA)) + +test_gt2 <- gen_tibble(x = test_genotypes_moreNA, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE) + + +test_that("behaviour of indiv_missingness",{ + + #create a subset gt + test_gt2_subset <- test_gt2 %>% select_loci(c(2,3,4,5,6)) + test_genotypes_moreNA_subset <- test_genotypes_moreNA[,c(2:6)] + + sum_na <- function(x){sum(is.na(x))} + # feeding the genotypes directly + expect_true(all(indiv_missingness(test_gt2_subset$genotypes)== + round(apply(test_genotypes_moreNA_subset, MARGIN = 1,sum_na)/ncol(test_genotypes_moreNA_subset), digits = 1))) + +}) + diff --git a/tests/testthat/test_loci_freq.R b/tests/testthat/test_loci_freq.R index 9a8ac109..a190e185 100644 --- a/tests/testthat/test_loci_freq.R +++ b/tests/testthat/test_loci_freq.R @@ -23,11 +23,46 @@ test_that("loci_alt_freq and loci_maf computes correctly",{ # repeat the tests for a subset of the data # remove the 2nd individual and the 3rd and 5th snp - test_genotypes <- test_genotypes[-2,c(-3,-5)] - test_gt <- test_gt %>% filter(id!="b") %>% select_loci(c(-3,-5)) - freq <- colSums(test_genotypes, na.rm=TRUE)/(c(2,2,2,1)*2) - expect_true(all(loci_alt_freq(test_gt$genotypes)==freq)) + test_genotypes_subset1 <- test_genotypes[-2,c(-3,-5)] + test_gt_subset1 <- test_gt %>% filter(id!="b") %>% select_loci(c(-3,-5)) + freq <- colSums(test_genotypes_subset1, na.rm=TRUE)/(c(2,2,2,1)*2) + expect_true(all(loci_alt_freq(test_gt_subset1$genotypes)==freq)) # convert to minor frequencies freq[freq>0.5] <- 1 - freq[freq>0.5] - expect_true(all(loci_maf(test_gt$genotypes)==freq)) + expect_true(all(loci_maf(test_gt_subset1$genotypes)==freq)) + + # repeat the tests for a subset where for loci 6, all genotypes are missing + # remove the 1st individual and the 3rd and 4th snp + test_genotypes_subset2 <- test_genotypes[-1,c(-3,-4)] + test_gt_subset2 <- test_gt %>% filter(id!="a") %>% select_loci(c(-3,-4)) + + #we expect NaN for loci 6 - as both genotypes are NA + freq <- colSums(test_genotypes_subset2, na.rm=TRUE)/(c(2,2,2,NA)*2) + expect_equal(loci_alt_freq(test_gt_subset2$genotypes),freq) + + # convert to minor frequencies + freq[freq>0.5 & !is.na(freq)] <- 1-freq[freq>0.5 & !is.na(freq)] + expect_equal(loci_maf(test_gt_subset2$genotypes),freq) + + #Test NA in centre + test_genotypes2 <- rbind(c(1,1,0,1,1,2), + c(2,1,0,NA,0,NA), + c(2,2,0,NA,1,NA)) + + test_gt2 <- gen_tibble(x = test_genotypes2, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE) + + # repeat the tests for a subset where for loci 6, all genotypes are missing + # remove the 1st individual + test_genotypes_subset3 <- test_genotypes2[-1,] + test_gt_subset3 <- test_gt2 %>% filter(id!="a") #%>% select_loci(c(-3,-4)) + + #we expect NaN for loci 4 and 6 - as both genotypes are NA + freq2 <- colSums(test_genotypes_subset3, na.rm=TRUE)/(c(2,2,2,NA,2,NA)*2) + expect_equal(loci_alt_freq(test_gt_subset3$genotypes),freq2) + + + # convert to minor frequencies + freq2[freq2>0.5 & !is.na(freq2)] <- 1-freq2[freq2>0.5 & !is.na(freq2)] + expect_equal(loci_maf(test_gt_subset3$genotypes),freq2) + }) diff --git a/tests/testthat/test_loci_hwe.R b/tests/testthat/test_loci_hwe.R new file mode 100644 index 00000000..e21b6951 --- /dev/null +++ b/tests/testthat/test_loci_hwe.R @@ -0,0 +1,45 @@ +test_that("loci_hwe produces the same output as plink --hardy midp",{ + + #Create gentibble for our data + bed_path <- system.file("extdata/related/families.bed", package = "tidypopgen") + families_bigsnp_path <- bigsnpr::snp_readBed(bed_path, backingfile = tempfile()) #bigsnpr::sub_bed(bed_path) + #families_bigsnp_path <- system.file("extdata/related/families.rds", package = "tidypopgen") + families <- gen_tibble(families_bigsnp_path, quiet = TRUE, valid_alleles = c("1","2")) + + #Read in plink results (first 10 snps) + plink_hwe <- read.table(system.file("extdata/related/families_hwe_midp.hwe", package = "tidypopgen"),header = TRUE) + + #Calculate hwe in tidypopgen + tidy_hwe <- families %>% select_loci(c(1:10)) %>% loci_hwe(mid_p = TRUE) + + #Compare + result <- all.equal(plink_hwe$P, tidy_hwe, tolerance = 0.0001) + + #Check results are the same to 4 decimals + expect_true(result) + +}) + + +test_that("loci_hwe mid_p = FALSE produces the same output as plink --hardy ",{ + + #Create gentibble for our data + bed_path <- system.file("extdata/related/families.bed", package = "tidypopgen") + families_bigsnp_path <- bigsnpr::snp_readBed(bed_path, backingfile = tempfile()) #bigsnpr::sub_bed(bed_path) + #families_bigsnp_path <- system.file("extdata/related/families.rds", package = "tidypopgen") + families <- gen_tibble(families_bigsnp_path, quiet = TRUE, valid_alleles = c("1","2")) + + #Read in plink results (first 10 snps) + plink_hwe <- read.table(system.file("extdata/related/families_hwe.hwe", package = "tidypopgen"),header = TRUE) + + #Calculate hwe in tidypopgen + tidy_hwe <- families %>% select_loci(c(1:10)) %>% loci_hwe(mid_p = FALSE) + + #Compare + result <- all.equal(plink_hwe$P, tidy_hwe, tolerance = 0.0001) + + #Check results are the same to 4 decimals + expect_true(result) + +}) + diff --git a/tests/testthat/test_loci_ld_clump.R b/tests/testthat/test_loci_ld_clump.R index 4d5e83f8..7fd27932 100644 --- a/tests/testthat/test_loci_ld_clump.R +++ b/tests/testthat/test_loci_ld_clump.R @@ -10,8 +10,9 @@ test_loci <- data.frame(name=paste0("rs",1:6), allele_ref = c("A","T","C","G","C","T"), allele_alt = c("T","C", NA,"C","G","A")) + test_gt <- gen_tibble(x = test_genotypes, indiv_meta = test_indiv_meta, loci = test_loci, - backingfile = tempfile()) + backingfile = tempfile(), quiet = TRUE) test_that("ld clumping runs",{ @@ -22,3 +23,71 @@ test_that("ld clumping runs",{ +test_that("loci_ld_clump returns the same as bigsnpr",{ + + bedfile <- system.file("extdata/related/families.bed", package="tidypopgen") + rds <- bigsnpr::snp_readBed(bedfile, backingfile = tempfile()) + + bigsnp <- bigsnpr::snp_attach(rds) + + gen_tbl <- gen_tibble(bedfile, quiet = TRUE, + backingfile = tempfile(), valid_alleles = c("1","2")) + + #also tests imputation + bigsnp_imputed <- bigsnpr::snp_fastImputeSimple(bigsnp$genotypes,method = "mode") + bigsnp$genotypes <- bigsnp_imputed + bigsnpr::snp_save(bigsnp) + bigsnp_imputed <- bigsnpr::snp_attach(rds) + + + gen_tbl_imputed <- gt_impute_simple(gen_tbl, method = "mode") + gt_set_imputed(gen_tbl_imputed, TRUE) + + #set up bigsnpr + G <- bigsnp_imputed$genotypes + POS <- bigsnp_imputed$map$physical.pos + CHR <- bigsnp_imputed$map$chromosome + + + ind.keep <- bigsnpr::snp_clumping(G, infos.chr = CHR, infos.pos = POS,thr.r2 = 0.2) + to_keep <- loci_ld_clump(gen_tbl_imputed, thr_r2 = 0.2) + + #convert our output to indices + ind <- which(to_keep == TRUE) + + #check they match + expect_true(all(ind.keep == ind)) + + +}) + + +test_that("loci_ld_clump error unsorted loci",{ + + pop_b <- gen_tibble(system.file("extdata/pop_b.bed", package="tidypopgen"),backingfile = tempfile(), quiet = TRUE) + + #now scramble the loci + set.seed(123) + random_order <- sample(1:17) + show_loci(pop_b) <- pop_b %>% select_loci(all_of(random_order)) %>% show_loci() + + #impute + pop_b_imputed <- gt_impute_simple(pop_b, method = "mode") + + #ld + expect_error(loci_ld_clump(pop_b_imputed, thr_r2 = 0.2), "Your loci are not sorted, try using:") + expect_false(identical(show_loci(pop_b_imputed), pop_b_imputed %>% show_loci() %>% arrange(chromosome,position))) + + #reorder the loci + show_loci(pop_b_imputed) <- pop_b_imputed %>% show_loci() %>% arrange(chromosome,position) + + #try again + expect_equal(loci_ld_clump(pop_b_imputed, thr_r2 = 0.2), + c(FALSE,TRUE,TRUE,FALSE,TRUE,TRUE,FALSE,FALSE,TRUE,TRUE,TRUE,TRUE,TRUE,TRUE,TRUE,TRUE,TRUE)) + expect_true(identical(show_loci(pop_b_imputed), pop_b_imputed %>% show_loci() %>% arrange(chromosome,position))) + +}) + + + + diff --git a/tests/testthat/test_loci_missingness.R b/tests/testthat/test_loci_missingness.R index f17f07ea..c7dd631f 100644 --- a/tests/testthat/test_loci_missingness.R +++ b/tests/testthat/test_loci_missingness.R @@ -20,4 +20,17 @@ test_that("snpbin_list_means computes correctly",{ expect_true(all(loci_missingness(test_gt$genotypes, as_counts = TRUE)==n_na)) # convert to frequencies expect_true(all(loci_missingness(test_gt$genotypes)==n_na/nrow(test_genotypes))) + + #create a subset gt + test_gt_subset1 <- test_gt %>% filter(id!="a") + #subset genotypes + test_genotypes_subset1 <- test_genotypes[-1,] + + n_na <- apply(test_genotypes_subset1, 2, count_na) + expect_true(all(loci_missingness(test_gt_subset1$genotypes, as_counts = TRUE)==n_na)) + # convert to frequencies + expect_true(all(loci_missingness(test_gt_subset1$genotypes)==n_na/nrow(test_genotypes_subset1))) + + + }) diff --git a/tests/testthat/test_loci_transversions_transitions.R b/tests/testthat/test_loci_transversions_transitions.R index 5eedd551..f091f7f5 100644 --- a/tests/testthat/test_loci_transversions_transitions.R +++ b/tests/testthat/test_loci_transversions_transitions.R @@ -16,3 +16,27 @@ test_that("find transitions and transversions",{ expect_true(all.equal(loci_transversions(test_gt), transv_bool)) expect_true(all.equal(loci_transitions(test_gt), !transv_bool)) }) + +test_that("check warning message for different alleles",{ + test_indiv_meta <- data.frame (id=c("a","b","c"), + population = c("pop1","pop1","pop2")) + test_genotypes <- rbind(c(1,1,0,1,1,2), + c(2,1,0,NA,0,NA), + c(2,2,0,0,1,NA)) + test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=c(1,1,1,1,2,2), + position=c(3,5,65,343,23,456), + genetic_dist = as.integer(rep(0,6)), + allele_ref = c("a","t","c","g","c","t"), + allele_alt = c("t","c", NA,"c","g","a")) + + #Create a gen_tibble with alleles that are not "A" "T" "C" "G" using valid_alleles + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE, valid_alleles = c("a", "t", "c", "g")) + + testthat::expect_error(loci_transversions(test_gt), "valid alleles are A T C G 0 . but ") + testthat::expect_error(loci_transitions(test_gt), "valid alleles are A T C G 0 . but ") + +}) + + + diff --git a/tests/testthat/test_select_loci_if.R b/tests/testthat/test_select_loci_if.R index 503472cb..078a77a1 100644 --- a/tests/testthat/test_select_loci_if.R +++ b/tests/testthat/test_select_loci_if.R @@ -31,6 +31,14 @@ test_that("select_loci_if subsets correctly",{ criterion_na <- c(TRUE, TRUE, NA, TRUE, TRUE, FALSE) expect_identical(test_gen_sub, test_gt %>% select_loci_if(criterion_na)) - #TODO test errors + + #Test error warnings: boolean criteria too short + criterion_na <- c(TRUE, TRUE, NA, TRUE, TRUE) + expect_error(test_gt %>% select_loci_if(criterion_na), ".sel_logical should be the same length ") + + #Test error warnings: selection criteria not boolean + rsID <- c("rs1", "rs2", "rs3", "rs4", "x1", "x2") + expect_error(test_gt %>% select_loci_if(rsID), ".sel_logical should be a logical boolean vector") + }) diff --git a/vignettes/plink_cheatsheet.Rmd b/vignettes/plink_cheatsheet.Rmd index e58ae56f..0e328a3c 100644 --- a/vignettes/plink_cheatsheet.Rmd +++ b/vignettes/plink_cheatsheet.Rmd @@ -25,10 +25,10 @@ data <- gen_tibble(bigsnp_path, quiet = TRUE) ## File management and reading data: -| PLINK | `tidypopgen` | +| PLINK | `tidypopgen` | |-------------------------------|----------------------------------------------| -| --make-bed --out | `gt_as_plink(data, bedfile = my_file)` | -| --recode | (TODO? generate .ped and .raw again) | +| --make-bed --out | `gt_as_plink(data, bedfile = my_file)` | +| --recode | `gt_as_plink(data, bedfile = my_file | | --allele1234 and --alleleACGT | See `gen_tibble()` parameter 'valid_alleles' | PLINK flags --update-alleles, --allele1234, and --alleleACGT, all alter the coding of alleles. In `tidypopgen`, valid alleles are supplied when reading in a `gen_tibble`.