diff --git a/DESCRIPTION b/DESCRIPTION index 99eaf9ff..de31fe18 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tidypopgen Title: Tidy Population Genetics -Version: 0.0.0.9019 +Version: 0.0.0.9020 Authors@R: c(person("Evie", "Carter", role = c("aut")), person("Andrea", "Manica", email = "am315@cam.ac.uk", diff --git a/R/filter_high_relatedness.R b/R/filter_high_relatedness.R index 75aebb7d..dd9bf700 100644 --- a/R/filter_high_relatedness.R +++ b/R/filter_high_relatedness.R @@ -72,7 +72,6 @@ filter_high_relatedness <- # for pairs with higher than kings_threshold relatedness # the individual with higher average relatedness is removed for (i in 1:(var_num - 1)) { - #browser() if (!any(matrix2[!is.na(matrix2)] > kings_threshold)) { if (verbose) { message("All correlations <=", kings_threshold, "\n") diff --git a/R/gen_tibble_packedancestry.R b/R/gen_tibble_packedancestry.R index 4e6801d6..4023567d 100644 --- a/R/gen_tibble_packedancestry.R +++ b/R/gen_tibble_packedancestry.R @@ -55,7 +55,6 @@ gen_tibble_packedancestry <- function(x, ..., inds = indiv_table$id[i], ..., verbose = !quiet) res$geno[is.na(res$geno)]<-3 - #browser() # now insert the genotypes in the FBM file_backed_matrix[ i, diff --git a/R/gt_pca.R b/R/gt_pca.R index a7e51155..a9417f01 100644 --- a/R/gt_pca.R +++ b/R/gt_pca.R @@ -14,5 +14,9 @@ #' NOTE: using gt_pca_autoSVD with a small dataset will likely cause an error, #' see man page for details. #' +#' NOTE: monomorphic markers must be removed before PCA is computed. The error +#' message 'Error: some variables have zero scaling; remove them before attempting +#' to scale.' indicates that monomorphic markers are present. +#' #' @name gt_pca NULL diff --git a/R/gt_pca_autoSVD.R b/R/gt_pca_autoSVD.R index fd505e1a..fe7fa4d0 100644 --- a/R/gt_pca_autoSVD.R +++ b/R/gt_pca_autoSVD.R @@ -50,6 +50,10 @@ #' #' Note: rather than accessing these elements directly, it is better to use #' `tidy` and `augment`. See [`gt_pca_tidiers`]. +#' Note: If you encounter 'Error in rollmean(): Parameter 'size' is too large.' +#' roll_size is exceeding the number of variants on at least one of your chromosomes. +#' If you have pre-specified roll_size, you will need to reduce this parameter. +#' If not, try specifying a reduced 'roll_size' to avoid this error. #' #' @export @@ -72,7 +76,13 @@ gt_pca_autoSVD <- function(x, k = 10, gt_set_imputed(x, set = TRUE) on.exit(gt_set_imputed(x, set = FALSE)) } - #browser() + + if(is.null(roll_size)){ + message("If you encounter 'Error in rollmean(): Parameter 'size' is too large.' + roll_size exceeds the number of variants on at least one of your chromosomes. + Try reducing 'roll_size' to avoid this error.") + } + X <- attr(x$genotypes,"bigsnp") # convenient pointer x_ind_col <- show_loci(x)$big_index x_ind_row <- vctrs::vec_data(x$genotypes) @@ -110,7 +120,6 @@ gt_pca_autoSVD <- function(x, k = 10, verbose = verbose) # add names to the scores (to match them to data later) rownames(this_svd$u)<-x$id - #browser() this_svd$method <- "autoSVD" this_svd$call <- match.call() # subset the loci table to have only the snps of interest diff --git a/R/pairwise_king.R b/R/pairwise_king.R index 5763c7a7..43f6dfed 100644 --- a/R/pairwise_king.R +++ b/R/pairwise_king.R @@ -2,7 +2,8 @@ #' #' This function computes the KING-robust estimator of kinship. #' -#' Note that monomorphic sites are currently considered. What does PLINK do??? +#' Note that monomorphic sites are currently considered. Remove monomorphic sites +#' before running pairwise_king if this is a concern. #' @param x a `gen_tibble` object. #' @param as_matrix boolean, determining whether the results should be a square symmetrical matrix (TRUE), #' or a tidied tibble (FALSE, the default) diff --git a/R/rbind_dry_run.R b/R/rbind_dry_run.R index c7e58498..c7941965 100644 --- a/R/rbind_dry_run.R +++ b/R/rbind_dry_run.R @@ -54,7 +54,6 @@ rbind_dry_run <- function(ref, target, use_position = FALSE, flip_strand = FALSE # rename the alleles ref_df <- ref_df %>% rename(allele_1 = "allele_alt", allele_2 = "allele_ref") target_df <- target_df %>% rename(allele_1 = "allele_alt", allele_2 = "allele_ref") - #browser() rbind_dry_run_df(ref_df = ref_df, target_df = target_df, flip_strand = flip_strand, @@ -115,8 +114,6 @@ rbind_dry_run_df <- function(ref_df, target_df, flip_strand, quiet){ to_flip <- to_flip & !ambiguous_sub to_swap <- to_swap & !ambiguous_sub } - # browser() - # now we create the two reporting data.frames # note that they include all loci (including the ones we dropped because they # did not exist in one of the datasets) @@ -153,7 +150,6 @@ rbind_dry_run_df <- function(ref_df, target_df, flip_strand, quiet){ resolve_missing_alleles <- function(missing_table, other_table){ - #browser() missing_replacements <- rep(NA, nrow(missing_table)) for (i_row in which(missing_table$allele_1=="0")){ # get non-missing allele diff --git a/R/rbind_gen_tibble.R b/R/rbind_gen_tibble.R index d6926054..2531f8e0 100644 --- a/R/rbind_gen_tibble.R +++ b/R/rbind_gen_tibble.R @@ -154,7 +154,6 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE, use_position vctrs::vec_data(ref$genotypes) #and finally append the loci table indivs_with_big_names <- c(names(ref$genotypes),names(target$genotypes)) - #browser() #new_ref_loci_tbl$big_index<-match(new_ref_loci_tbl$name,merged_snp$map$marker.ID) # TODO check that this is the correct order!!!! new_ref_loci_tbl$big_index<-1:nrow(new_ref_loci_tbl) # by default, this should just be a subset in the same order as the reference diff --git a/man/gt_pca.Rd b/man/gt_pca.Rd index 0b7e4d8a..c26d006d 100644 --- a/man/gt_pca.Rd +++ b/man/gt_pca.Rd @@ -18,4 +18,8 @@ thousands markers, or larger). \details{ NOTE: using gt_pca_autoSVD with a small dataset will likely cause an error, see man page for details. + +NOTE: monomorphic markers must be removed before PCA is computed. The error +message 'Error: some variables have zero scaling; remove them before attempting +to scale.' indicates that monomorphic markers are present. } diff --git a/man/gt_pca_autoSVD.Rd b/man/gt_pca_autoSVD.Rd index ad94dda9..86a33be5 100644 --- a/man/gt_pca_autoSVD.Rd +++ b/man/gt_pca_autoSVD.Rd @@ -80,6 +80,10 @@ A named list (an S3 class "big_SVD") of Note: rather than accessing these elements directly, it is better to use \code{tidy} and \code{augment}. See \code{\link{gt_pca_tidiers}}. +Note: If you encounter 'Error in rollmean(): Parameter 'size' is too large.' +roll_size is exceeding the number of variants on at least one of your chromosomes. +If you have pre-specified roll_size, you will need to reduce this parameter. +If not, try specifying a reduced 'roll_size' to avoid this error. } \description{ This function performs Principal Component Analysis on a \code{gen_tibble}, diff --git a/man/pairwise_king.Rd b/man/pairwise_king.Rd index ff9f2070..0875df88 100644 --- a/man/pairwise_king.Rd +++ b/man/pairwise_king.Rd @@ -23,5 +23,6 @@ but will tax memory.} This function computes the KING-robust estimator of kinship. } \details{ -Note that monomorphic sites are currently considered. What does PLINK do??? +Note that monomorphic sites are currently considered. Remove monomorphic sites +before running pairwise_king if this is a concern. } diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R index 94da77c6..6ff73210 100644 --- a/tests/testthat/test_gen_tibble.R +++ b/tests/testthat/test_gen_tibble.R @@ -731,9 +731,6 @@ test_that("gen_tibble family.ID from vcf",{ expect_true(all(fam$V1 == pop_b_vcf_gt$id)) expect_true(all(fam$V1 == fam$V2)) }) - - - # Windows prevents the deletion of the backing file. It's something to do with the memory mapping # library used by bigsnpr # test_that("on error, we remove the old files",{ diff --git a/tests/testthat/test_gt_pca.R b/tests/testthat/test_gt_pca.R index 857d018e..b2a8a09a 100644 --- a/tests/testthat/test_gt_pca.R +++ b/tests/testthat/test_gt_pca.R @@ -29,21 +29,16 @@ test_that("fit_gt_pca_and_predict",{ }) -# TODO we should test gt_pca_autoSVD(), as the loci have to be subset within -# the object - -test_that("gt_pca_autoSVD problem ",{ +test_that("adjusting roll_size fixes gt_pca_autoSVD problem ",{ # @TODO remove this hack when the patch makes it through to cran - library(bigsnpr) - # Generate example_indiv_meta data frame individual_ids <- paste0("indiv", 1:200) populations <- sample(c("pop1", "pop2", "pop3", "pop4"), size = 200, replace = TRUE) example_indiv_meta <- data.frame(id = individual_ids, population = populations) - # Generate example_genotypes matrix + # Generate example_genotypes matrix (no missingness) values <- c(0, 1, 2) genotype_matrix <- sample(values, size = 200 * 500, replace = TRUE) example_genotypes <- matrix(genotype_matrix, nrow = 200, ncol = 500) @@ -65,55 +60,43 @@ test_that("gt_pca_autoSVD problem ",{ allele_alt = allele_alts ) - + # create gen_tibble test <- gen_tibble(x = example_genotypes, loci = example_loci, indiv_meta = example_indiv_meta, quiet = TRUE) - #works fine - #test_pca <- test %>% gt_pca_autoSVD() - - - + # gen_tibble with no missingness runs + test_pca <- test %>% gt_pca_autoSVD() # Now try with imputed data + library(bigsnpr) bed_file <- system.file("extdata", "example-missing.bed", package = "bigsnpr") missing_gt <- gen_tibble(bed_file, backingfile = tempfile("missing_"),quiet=TRUE) - expect_error(missing_gt %>% gt_pca_partialSVD(), + expect_error(missing_gt %>% gt_pca_autoSVD(), "You can't have missing") missing_gt <- gt_impute_simple(missing_gt, method = "mode") - expect_error(missing_pca <- missing_gt %>% gt_pca_autoSVD(verbose = FALSE), "Parameter 'size' is too large.") - - #Issue for bigutilsr below: - #https://github.com/privefl/bigutilsr/issues/2 - - #Error in bigutilsr::rollmean(S.col[ind], roll.size) : - #Parameter 'size' is too large. - - #the error persists when manually adjusting roll_size - - expect_error(missing_pca <- missing_gt %>% gt_pca_autoSVD(roll_size = 10, verbose = FALSE), "Parameter 'size' is too large.") - expect_error(missing_pca <- missing_gt %>% gt_pca_autoSVD(roll_size = 1000, verbose = FALSE), "Parameter 'size' is too large.") - - #until adjusting roll_size to 0 - #missing_gt %>% gt_pca_autoSVD(roll_size = 0) - #or strangely up to 7? - #missing_gt %>% gt_pca_autoSVD(roll_size = 7) + # we encounter roll_size error + expect_error(missing_pca <- missing_gt %>% gt_pca_autoSVD(verbose = FALSE), "Parameter 'size' is too large.") + # adjusting roll_size fixes the error + test_pca_roll0 <- missing_gt %>% gt_pca_autoSVD(roll_size = 0) + expect_s3_class(test_pca_roll0, "gt_pca") + test_pca_roll7 <- missing_gt %>% gt_pca_autoSVD(roll_size = 7) + expect_s3_class(test_pca_roll7, "gt_pca") - #loci info is the same - #summary(test %>% show_loci()) - #summary(missing_gt %>% show_loci()) #testing with the families dataset too bed_file <- system.file("extdata/related", "families.bed", package = "tidypopgen") - missing_gt <- gen_tibble(bed_file, backingfile = tempfile("families"),quiet=TRUE, valid_alleles = c("2","1")) - expect_error( missing_gt %>% gt_pca_partialSVD(), + families <- gen_tibble(bed_file, backingfile = tempfile("families"),quiet=TRUE, valid_alleles = c("2","1")) + expect_error( families %>% gt_pca_autoSVD(), "You can't have missing") - missing_gt <- gt_impute_simple(missing_gt, method = "mode") + families <- gt_impute_simple(families, method = "mode") # the same error occurs - expect_error(missing_pca <- missing_gt %>% gt_pca_autoSVD(verbose = FALSE), "Parameter 'size' is too large.") + expect_error(missing_pca <- families %>% gt_pca_autoSVD(verbose = FALSE), "Parameter 'size' is too large.") + # adjusting roll_size fixes the error + test_pca_families_roll7 <- families %>% gt_pca_autoSVD(roll_size = 7) + expect_s3_class(test_pca_families_roll7, "gt_pca") }) @@ -138,3 +121,25 @@ test_that("fit_gt_pca_and_predict_splitted_data",{ lsq_pred <- predict(modern_pca, new_data = ancient_gt, project_method = "least_squares") expect_true(all(dim(lsq_pred)==c(20,2))) }) + +test_that("PCA functions work with loci out of order",{ + bed_file <- system.file("extdata", "example-missing.bed", package = "bigsnpr") + missing_gt <- gen_tibble(bed_file, backingfile = tempfile("missing_"),quiet=TRUE) + missing_gt <- gt_impute_simple(missing_gt, method = "mode") + missing_part_pca1 <- missing_gt %>% gt_pca_partialSVD() + missing_rand_pca1 <- missing_gt %>% gt_pca_randomSVD() + + # now shuffle the loci + loci <- missing_gt %>% show_loci() + loci <- loci[sample(nrow(loci)),] + show_loci(missing_gt) <- loci + + # Rerun PCA + missing_part_pca2 <- missing_gt %>% gt_pca_partialSVD() + missing_rand_pca2 <- missing_gt %>% gt_pca_randomSVD() + + expect_equal(missing_part_pca1$loadings, missing_part_pca2$loadings) + expect_equal(missing_rand_pca1$loadings, missing_rand_pca2$loadings) +}) + + diff --git a/vignettes/a02_qc.Rmd b/vignettes/a02_qc.Rmd index 5f329df3..861dd5ad 100644 --- a/vignettes/a02_qc.Rmd +++ b/vignettes/a02_qc.Rmd @@ -26,6 +26,62 @@ data <- gen_tibble(system.file("extdata/related/families.bed", valid_alleles = c("1","2")) ``` + +#Quality control for individuals + +```{r} +individual_report <- qc_report_indiv(data) +summary(individual_report) +``` + +The output of `qc_report_indiv` supplies observed heterozygosity per individual, and rate of missingness per individual as standard. + +These data can also be visualised using autoplot: + +```{r, fig.alt = "Scatter plot of missingness proportion and observed heterozygosity for each individual"} +autoplot(individual_report) +``` + +Here we can see that most individuals have low missingness. If we wanted to filter individuals to remove those with more than 4.5% of their genotypes missing, we can use `filter`. + +```{r} +data <- data %>% filter(indiv_missingness(genotypes)<0.045) +nrow(data) +``` + +And if we wanted to remove outliers with particularly high or low heterozygosity, we can again do so by using `filter`. As an example, here we remove observations that lie more than 3 standard deviations from the mean. + +```{r} +mean_val <- mean(individual_report$het_obs) +sd_val <- stats::sd(individual_report$het_obs) + +lower <- mean_val - 3*(sd_val) +upper <- mean_val + 3*(sd_val) + +data <- data %>% filter(indiv_het_obs(genotypes) > lower) +data <- data %>% filter(indiv_het_obs(genotypes) < upper) +nrow(data) +``` + +Next, we can look at relatedness within our sample. If the parameter `kings_threshold` is provided to `qc_report_indiv()`, then the report also calculates a KING coefficient of relatedness matrix using the sample. The `kings_threshold` is used to provide an output of the largest possible group with no related individuals in the third column `to_keep`. This boolean column recommends which individuals to remove (FALSE) and to keep (TRUE) to achieve an unrelated sample. + +```{r} +individual_report <- qc_report_indiv(data, kings_threshold = 0.177) +summary(individual_report) +``` + +We can remove the recommended individuals by using: + +```{r} +data <- data %>% filter(id %in% individual_report$id & individual_report$to_keep == TRUE) +``` + +We can now view a summary of our cleaned data set again, showing that our data has reduced from 12 to 9 individuals. + +```{r} +summary(data) +``` + # Quality control for loci ```{r} @@ -41,14 +97,14 @@ autoplot(loci_report, type = "overview") Using 'overview' provides an Upset plot, which is designed to show the intersection of different sets in the same way as a Venn diagram. SNPs can be divided into 'sets' that each pass predefined quality control threshold; a set of SNPs with missingness under a given threshold, a set of SNPs with MAF above a given threshold, and a set of SNPs with a Hardy-Weinberg exact p-value that falls above a given significance level. -The upset plot then visualises our 961 SNPs within their respective sets. The number above the first bar indicates that, of our total SNPs, 609 occur in all 3 sets, meaning 609 SNPs pass all of our automatic QC measures. The combined total of the first and second bars represents the number of SNPs that pass our MAF and HWE thresholds, here, 956. - -The thresholds for each parameter, (level of missingness that is accepted etc) can be adjusted using the parameters provided in autoplot. For example: +The thresholds for each parameter, (percentage of missingness that is accepted, minor allele frequency cutoff, and Hardy-Weinberg equilibrium p-value) can be adjusted using the parameters provided in autoplot. For example: ```{r, fig.alt = "Upset plot as above, with adjusted thresholds"} autoplot(loci_report, type = "overview", miss_threshold = 0.03, maf_threshold = 0.02, hwe_p = 0.01) ``` +The upset plot then visualises our 961 SNPs within their respective sets. The number above the second bar indicates that 262 SNPs occur in all 3 sets, meaning 262 SNPs pass all of our QC thresholds. The combined total of the first and second bars represents the number of SNPs that pass our MAF and HWE thresholds, here 939 SNPs. + To examine each QC measure in further detail, we can plot a different summary panel. ```{r, fig.alt = "Four panel plot, containing: a histogram of the proportion of missing data for snps with minor allele freqency above the threshold, a histogram of the proportion of missing data for snps with minor allele freqency below the threshold, a histogram of HWE exact test p-values, and a histogram of significant HWE exact test p-values"} @@ -87,70 +143,13 @@ Finally, we may want to remove SNPs that show significant deviation from Hardy-W autoplot(loci_report,type = "significant hwe", hwe_p = 0.01) ``` -Few SNPs in our data are significant, however there may be circumstances where we would want to cut out the most extreme cases, if these data were real, these cases could indicate genotyping errors. +None of the SNPs in our data are significant, however there may be circumstances where we would want to cut out the most extreme cases, if these data were real, these cases could indicate genotyping errors. ```{r} data <- data %>% select_loci_if(loci_hwe(genotypes)>0.01) count_loci(data) ``` -Once we have quality controlled the SNPs in our data, we can turn to individual samples. - -#Quality control for individuals - -```{r} -individual_report <- qc_report_indiv(data) -summary(individual_report) -``` - -The output of `qc_report_indiv` supplies observed heterozygosity per individual, and rate of missingness per individual as standard. - -These data can also be visualised using autoplot: - -```{r, fig.alt = "Scatter plot of missingness proportion and observed heterozygosity for each individual"} -autoplot(individual_report) -``` - -Here we can see that most individuals have low missingness. If we wanted to filter individuals to remove those with more than 3% of their genotypes missing, we can use `filter`. - -```{r} -data <- data %>% filter(indiv_missingness(genotypes)<0.03) -nrow(data) -``` - -And if we wanted to remove outliers with particularly high or low heterozygosity, we can again do so by using `filter`. As an example, here we remove observations that lie more than 3 standard deviations from the mean. - -```{r} -mean_val <- mean(individual_report$het_obs) -sd_val <- stats::sd(individual_report$het_obs) - -lower <- mean_val - 3*(sd_val) -upper <- mean_val + 3*(sd_val) - -data <- data %>% filter(indiv_het_obs(genotypes) > lower) -data <- data %>% filter(indiv_het_obs(genotypes) < upper) -nrow(data) -``` - -Next, we can look at relatedness within our sample. If the parameter `kings_threshold` is provided to `qc_report_indiv()`, then the report also calculates a KING coefficient of relatedness matrix using the sample. The `kings_threshold` is used to provide an output of the largest possible group with no related individuals in the third column `to_keep`. This boolean column recommends which individuals to remove (FALSE) and to keep (TRUE) to achieve an unrelated sample. - -```{r} -individual_report <- qc_report_indiv(data, kings_threshold = 0.177) -summary(individual_report) -``` - -We can remove the recommended individuals by using: - -```{r} -data <- data %>% filter(id %in% individual_report$id & individual_report$to_keep == TRUE) -``` - -We can now view a summary of our cleaned data set again, showing that our data has reduced from 12 to 10 individuals. - -```{r} -summary(data) -``` - # Linkage Disequilibrium For further analyses, it may be necessary to control for linkage in the data set. tidypopgen provides LD clumping. This option is similar to the --indep-pairwise flag in plink, but results in a more even distribution of loci when compared to LD pruning. @@ -194,7 +193,7 @@ For some quality control measures, if you have a gen_tibble that includes multip First, lets add some imaginary population data to our gen_tibble: ```{r} -data <- data %>% mutate(population = rep(c("A", "B"), each = 5)) +data <- data %>% mutate(population = c(rep("A", 4), rep("B", 5))) ``` We can then group by population and run quality control on each group: @@ -214,3 +213,13 @@ head(grouped_individual_report) ``` This is important when we have related individuals, as background population structure can affect the filtering of relatives. + +# Grouped functions + +It is also possible to run `loci` and `indiv` functions on grouped data. This is useful when you want to run the same quality control on each group of data, but don't want to split the data into separate gen_tibbles. + +```{r} +loci_maf_grouped <- data %>% group_by(population) %>% loci_maf() +``` + +Grouped functions are built for efficiency and surpass the use of applying a function with `group_map`.