Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sum stats #56

Merged
merged 21 commits into from
Oct 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: tidypopgen
Title: Tidy Population Genetics
Version: 0.0.0.9017
Version: 0.0.0.9018
Authors@R:
c(person("Evie", "Carter", role = c("aut")),
person("Andrea", "Manica", email = "[email protected]",
Expand Down
7 changes: 4 additions & 3 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,10 @@ S3method(dplyr_row_slice,grouped_gen_tbl)
S3method(gen_tibble,character)
S3method(gen_tibble,matrix)
S3method(group_by,gen_tbl)
S3method(indiv_het_obs,grouped_df)
S3method(indiv_het_obs,tbl_df)
S3method(indiv_het_obs,vctrs_bigSNP)
S3method(indiv_missingness,grouped_df)
S3method(indiv_missingness,tbl_df)
S3method(indiv_missingness,vctrs_bigSNP)
S3method(indiv_ploidy,grouped_df)
S3method(indiv_ploidy,tbl_df)
S3method(indiv_ploidy,vctrs_bigSNP)
S3method(loci_alt_freq,grouped_df)
Expand Down Expand Up @@ -124,6 +121,10 @@ export(pairwise_king)
export(pairwise_pop_fst)
export(pop_fis)
export(pop_fst)
export(pop_gene_div)
export(pop_global_stats)
export(pop_het_exp)
export(pop_het_obs)
export(q_matrix)
export(qc_report_indiv)
export(qc_report_loci)
Expand Down
3 changes: 3 additions & 0 deletions R/gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#' a fast C++ parser, or "vcfR" to use the R package `vcfR`. The latter is slower
#' but more robust; if "cpp" gives error, try using "vcfR" in case your VCF has
#' an unusual structure.
#' @param n_cores the number of cores to use for parallel processing
#' @param valid_alleles a vector of valid allele values; it defaults to 'A','T',
#' 'C' and 'G'.
#' @param missing_alleles a vector of values in the BIM file/loci dataframe that
Expand Down Expand Up @@ -76,6 +77,7 @@ gen_tibble.character <-
function(x,
...,
parser = c("vcfR","cpp"),
n_cores = 1,
chunk_size = NULL,
valid_alleles = c("A", "T", "C", "G"),
missing_alleles = c("0","."),
Expand Down Expand Up @@ -107,6 +109,7 @@ gen_tibble.character <-
quiet = quiet)
} else if ((tolower(file_ext(x))=="vcf") || (tolower(file_ext(x))=="gz")){
return(gen_tibble_vcf(x = x, ..., parser = parser, chunk_size = chunk_size,
n_cores = n_cores,
valid_alleles= valid_alleles,
missing_alleles= missing_alleles,
backingfile = backingfile, quiet = quiet))
Expand Down
20 changes: 0 additions & 20 deletions R/gt_helper_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,23 +10,3 @@
.gt_get_bigsnp<-function(.x){
attr(.x$genotypes,"bigsnp")
}


# a developer function to create various count summaries of a population, used to
# compute more complex statistics (e.g. pairwise fst, etc.).
.gt_pop_freqs <- function(.x){
counts <- bigstatsr::big_counts( .gt_get_bigsnp(.x)$genotypes,
ind.row =.gt_bigsnp_rows(.x),
ind.col = .gt_bigsnp_cols(.x))
sums_alt <- apply(counts,2,function(x) x[2]+2*x[3])
n <- apply(counts,2,function(x) sum(x[1:3])*2)
sums_ref <- n - sums_alt
freq_alt <- sums_alt/n
freq_ref <- 1- freq_alt
het_obs <- apply(counts,2,function(x) x[2]/sum(x[1:3]))
return (list(
freq_alt = freq_alt,
freq_ref = freq_ref,
n = n,
het_obs = het_obs))
}
4 changes: 2 additions & 2 deletions R/gt_roh_window.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@
#' @export
#'
#' @examples
#' # don't run the example
#' if (FALSE) {
#' # run the example only if we have the package installed
#' if (requireNamespace("detectRUNS", quietly = TRUE)) {
#' sheep_ped <- system.file("extdata", "Kijas2016_Sheep_subset.ped",
#' package="detectRUNS")
#' sheep_gt <- tidypopgen::gen_tibble(sheep_ped, backingfile = tempfile(),
Expand Down
10 changes: 5 additions & 5 deletions R/indiv_het_obs.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,8 @@ indiv_het_obs.vctrs_bigSNP <- function(.x, ...){
this_col_1_na[1,]/(ncol(X)-this_col_1_na[2,])
}

#' @export
#' @rdname indiv_het_obs
indiv_het_obs.grouped_df <- function(.x, ...){
.x %>% mutate(indiv_het_obs = indiv_het_obs(.data$genotypes)) %>% summarise(het_obs = mean(indiv_het_obs))
}
# #' @export
# #' @rdname indiv_het_obs
# indiv_het_obs.grouped_df <- function(.x, ...){
# .x %>% mutate(indiv_het_obs = indiv_het_obs(.data$genotypes)) %>% summarise(het_obs = mean(indiv_het_obs))
# }
12 changes: 6 additions & 6 deletions R/indiv_missingness.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ indiv_missingness.vctrs_bigSNP <- function(.x, as_counts = FALSE, ...){
row_na
}

#' @export
#' @rdname indiv_missingness
indiv_missingness.grouped_df <- function(.x, as_counts = FALSE, ...){
.x %>% mutate(indiv_missingness = indiv_missingness(.data$genotypes, as_counts = as_counts)) %>%
summarise(het_obs = mean(indiv_missingness))
}
#' #' @export
#' #' @rdname indiv_missingness
#' indiv_missingness.grouped_df <- function(.x, as_counts = FALSE, ...){
#' .x %>% mutate(indiv_missingness = indiv_missingness(.data$genotypes, as_counts = as_counts)) %>%
#' summarise(het_obs = mean(indiv_missingness))
#' }
12 changes: 6 additions & 6 deletions R/indiv_ploidy.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,9 @@ indiv_ploidy.vctrs_bigSNP <- function(.x, ...){
}
}

#' @export
#' @rdname indiv_ploidy
indiv_ploidy.grouped_df <- function(.x, ...){
.x %>% mutate(indiv_ploidy = indiv_ploidy(.data$genotypes)) %>%
summarise(ploidy = mean(indiv_ploidy))
}
#' #' @export
#' #' @rdname indiv_ploidy
#' indiv_ploidy.grouped_df <- function(.x, ...){
#' .x %>% mutate(indiv_ploidy = indiv_ploidy(.data$genotypes)) %>%
#' summarise(ploidy = mean(indiv_ploidy))
#' }
82 changes: 76 additions & 6 deletions R/pop_fis.R
Original file line number Diff line number Diff line change
@@ -1,19 +1,89 @@
#' Compute population specific FIS
#'
#' This function computes population specific FIS (as computed by [hierfstat::fis.dosage()]).
#' This function computes population specific FIS, using either the approach of Nei 1987 (as computed by [hierfstat::basic.stats()]) or of Weir and Goudet 2017
#' (as computed by [hierfstat::fis.dosage()]).
#' @references
#' Nei M. (1987) Molecular Evolutionary Genetics. Columbia University Press
#' Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085
#' @param .x a grouped [`gen_tibble`] (as obtained by using [dplyr::group_by()])
#' @param include_global boolean determining whether, besides the population specific fis, a global
#' fis should be appended. Note that this will return a vector of n populations plus 1 (the global value)
#' @param allele_sharing_mat optional, the matrix of Allele Sharing returned by
#' @param method one of "Nei87" (based on Nei 1987, eqn 7.41) or "WG17" (for Weir and Goudet 2017) to compute FIS
#' @param by_locus boolean, determining whether FIS should be returned by locus(TRUE),
#' or as a single genome wide value (FALSE, the default). Note that this is only relevant for "Nei87",
#' as "WG17" always returns a single value.
#' @param include_global boolean determining whether, besides the population specific estiamtes, a global
#' estimate should be appended. Note that this will return a vector of n populations plus 1 (the global value),
#' or a matrix with n+1 columns if `by_locus=TRUE`.
#' @param allele_sharing_mat optional and only relevant for "WG17", the matrix of Allele Sharing returned by
#' [pairwise_allele_sharing()] with `as_matrix=TRUE`. As a number of statistics can be
#' derived from the Allele Sharing matrix,
#' it it sometimes more efficient to pre-compute this matrix.
#' @returns a vector of population specific fis (plus the global value if `include_global=TRUE`)
#' @export

pop_fis <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){
pop_fis <- function(.x, method = c("Nei87", "WG17"), by_locus = FALSE, include_global=FALSE, allele_sharing_mat = NULL){
method <- match.arg(method)
if (method == "Nei87"){
if (!is.null(allele_sharing_mat)){
stop("allele_sharing_mat not relevant for Nei87")
}
pop_fis_nei87(.x, by_locus = by_locus, include_global = include_global)
} else if (method == "WG17"){
if (by_locus){
stop("by_locus not implemented for WG17")
}
pop_fis_wg17(.x, include_global=include_global, allele_sharing_mat = allele_sharing_mat)
}
}

pop_fis_nei87 <- function(.x, by_locus = FALSE, include_global=include_global,
n_cores = bigstatsr::nb_cores()){
stopifnot_diploid(.x)
# get the populations if it is a grouped gen_tibble
if (inherits(.x,"grouped_df")){
.group_levels <- .x %>% group_keys()
.group_ids <- dplyr::group_indices(.x)-1
} else { # create a dummy pop
.group_levels = tibble(population="pop")
.group_ids <- rep(0,nrow(.x))
}

# summarise population frequencies
pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes,
rowInd = .gt_bigsnp_rows(.x),
colInd = .gt_bigsnp_cols(.x),
groupIds = .group_ids,
ngroups = nrow(.group_levels),
ncores = n_cores)
sHo <-pop_freqs_df$het_obs
mHo <- apply(sHo, 1, mean, na.rm = TRUE)
n <-pop_freqs_df$n/2
# sum of squared frequencies
sp2 <- pop_freqs_df$freq_alt^2+pop_freqs_df$freq_ref^2
Hs <- (1 - sp2 - sHo/2/n)
Hs <- n/(n - 1) * Hs
Fis = 1 - sHo/Hs

colnames(Fis)<- .group_levels %>% dplyr::pull(1)
if (by_locus){
if (include_global){
global <- (.x %>% pop_global_stats(by_locus = TRUE, n_cores = n_cores))$Fis
Fis <- cbind(Fis, global)
}
} else {
Fis <- colMeans(Fis, na.rm = TRUE)
if (include_global){
global <- (.x %>% pop_global_stats(by_locus = FALSE, n_cores = n_cores))["Fis"]
names(global) <- "global"
Fis <- c(Fis, global)
}
}
return(Fis)
}


pop_fis_wg17 <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){
if (!inherits(.x,"grouped_df")){
stop (".x should be a grouped df")
stop (".x should be a grouped gen_tibble")
}
if (is.null(allele_sharing_mat)){
allele_sharing_mat <- pairwise_allele_sharing(.x, as_matrix = TRUE)
Expand Down
6 changes: 4 additions & 2 deletions R/pop_fst.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#' Compute population specific Fst
#'
#' This function computes population specific Fst (as computed by [hierfstat::fst.dosage()]).
#' This function computes population specific Fst, using the approach in Weir and Goudet 2017
#' (as computed by [hierfstat::fst.dosage()]).
#' @references Weir, BS and Goudet J (2017) A Unified Characterization of Population Structure and Relatedness. Genetics (2017) 206:2085
#' @param .x a grouped [`gen_tibble`] (as obtained by using [dplyr::group_by()])
#' @param include_global boolean determining whether, besides the population specific Fst, a global
#' Fst should be appended. Note that this will return a vector of n populations plus 1 (the global value)
Expand All @@ -12,7 +14,7 @@

pop_fst <- function(.x, include_global=FALSE, allele_sharing_mat = NULL){
if (!inherits(.x,"grouped_df")){
stop (".x should be a grouped df")
stop (".x should be a grouped gen_tibble")
}
if (is.null(allele_sharing_mat)){
allele_sharing_mat <- pairwise_allele_sharing(.x, as_matrix = TRUE)
Expand Down
Loading
Loading