Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Admixtools #39

Merged
merged 16 commits into from
May 17, 2024
Merged
5 changes: 4 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: tidypopgen
Title: Tidy Population Genetics
Version: 0.0.0.9011
Version: 0.0.0.9012
Authors@R:
c(person("Andrea", "Manica", email = "[email protected]",
role = c("aut", "cre"),
Expand Down Expand Up @@ -37,6 +37,7 @@ Imports:
vctrs
Suggests:
adegenet,
admixtools,
broom,
hierfstat,
knitr,
Expand All @@ -46,6 +47,8 @@ Suggests:
readr,
testthat (>= 3.0.0),
vcfR
Remotes:
uqrmaie1/admixtools
VignetteBuilder: knitr
Config/testthat/edition: 3
LinkingTo:
Expand Down
6 changes: 2 additions & 4 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,6 @@ S3method(loci_missingness,tbl_df)
S3method(loci_missingness,vctrs_bigSNP)
S3method(loci_names,tbl_df)
S3method(loci_names,vctrs_bigSNP)
S3method(loci_sums,grouped_df)
S3method(loci_sums,tbl_df)
S3method(loci_sums,vctrs_bigSNP)
S3method(loci_transitions,grouped_df)
S3method(loci_transitions,tbl_df)
S3method(loci_transitions,vctrs_bigSNP)
Expand Down Expand Up @@ -85,6 +82,7 @@ export(gt_as_plink)
export(gt_cluster_pca)
export(gt_cluster_pca_best_k)
export(gt_dapc)
export(gt_extract_f2)
export(gt_get_file_names)
export(gt_has_imputed)
export(gt_impute_simple)
Expand All @@ -107,7 +105,6 @@ export(loci_ld_clump)
export(loci_maf)
export(loci_missingness)
export(loci_names)
export(loci_sums)
export(loci_transitions)
export(loci_transversions)
export(pairwise_allele_sharing)
Expand All @@ -133,6 +130,7 @@ importFrom(Rcpp,sourceCpp)
importFrom(generics,augment)
importFrom(generics,tidy)
importFrom(ggplot2,autoplot)
importFrom(magrittr,"%<>%")
importFrom(magrittr,"%>%")
importFrom(rlang,":=")
useDynLib(tidypopgen, .registration = TRUE)
16 changes: 16 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,22 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

gt_grouped_alt_freq_diploid <- function(BM, rowInd, colInd, groupIds, ngroups, ncores) {
.Call(`_tidypopgen_gt_grouped_alt_freq_diploid`, BM, rowInd, colInd, groupIds, ngroups, ncores)
}

gt_grouped_alt_freq_pseudohap <- function(BM, rowInd, colInd, groupIds, ngroups, ploidy, ncores) {
.Call(`_tidypopgen_gt_grouped_alt_freq_pseudohap`, BM, rowInd, colInd, groupIds, ngroups, ploidy, ncores)
}

gt_grouped_missingness <- function(BM, rowInd, colInd, groupIds, ngroups, ncores) {
.Call(`_tidypopgen_gt_grouped_missingness`, BM, rowInd, colInd, groupIds, ngroups, ncores)
}

gt_grouped_summaries <- function(BM, rowInd, colInd, groupIds, ngroups, ncores) {
.Call(`_tidypopgen_gt_grouped_summaries`, BM, rowInd, colInd, groupIds, ngroups, ncores)
}

SNPHWE2 <- function(obs_hets, obs_hom1, obs_hom2, midp) {
.Call(`_tidypopgen_SNPHWE2`, obs_hets, obs_hom1, obs_hom2, midp)
}
Expand Down
13 changes: 11 additions & 2 deletions R/gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,17 @@
#' A `gen_tibble` stores genotypes for individuals in a tidy format. DESCRIBE
#' here the format
#' @param x can be:
#' - a string giving the path to a PLINK BED or PED file. The correspective
#' BIM and FAM fiel for the BED, or MAP for PED are expected to be in the same
#' - a string giving the path to a PLINK BED or PED file. The associated
#' BIM and FAM files for the BED, or MAP for PED are expected to be in the same
#' directory and have the same file name.
#' - a string giving the path to a RDS file storing a `bigSNP` object from
#' the `bigsnpr` package (usually created with [bigsnpr::snp_readBed()])
#' - a string giving the path to a vcf file. Note that we currently read the whole
#' vcf in memory with `vcfR`, so only smallish vcf can be imported. Only biallelic
#' SNPs will be considered.
#' - a string giving the path to a packedancestry .geno file. The associated
#' .ind and .snp files are expected to be in the same directory and share the
#' same file name prefix.
#' - a genotype matrix of dosages (0, 1, 2, NA) giving the dosage of the alternate
#' allele.
#' @param indiv_meta a list, data.frame or tibble with compulsory columns 'id'
Expand Down Expand Up @@ -91,6 +94,12 @@ gen_tibble.character <-
missing_alleles= missing_alleles,
backingfile = backingfile,
quiet = quiet)
} else if (tolower(file_ext(x))=="geno"){
gen_tibble_packedancestry(x = x, ...,
valid_alleles= valid_alleles,
missing_alleles= missing_alleles,
backingfile = backingfile,
quiet = quiet)
} else {
stop("file_path should be pointing to a either a PLINK .bed or .ped file, a bigSNP .rds file or a VCF .vcf or .vcf.gz file")
}
Expand Down
41 changes: 41 additions & 0 deletions R/gen_tibble_packedancestry.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#A function to read geno packedancestrymap files
gen_tibble_packedancestry <- function(x, ...,
valid_alleles = c("A", "T", "C", "G"),
missing_alleles = c("0","."),
backingfile = NULL, quiet = FALSE) {
# Substitute .ped with .map
map_file <- sub("\\.geno$", ".snp", x)
if (!file.exists(map_file)){
stop("snp file ",map_file," does not exist")
}
ind_file <- sub("\\.geno$", ".ind", x)
if (!file.exists(ind_file)){
stop("ind file ",ind_file," does not exist")
}

if (is.null(backingfile)){
backingfile <- sub("\\.geno$", "", x)
}
if (file_ext(backingfile)=="bk"){
backingfile <- bigstatsr::sub_bk(backingfile,"")
}

res <- admixtools::read_packedancestrymap(sub("\\.geno$", "", x),
transpose = TRUE,
...)
names(res$ind)<-c("id","sex","population")
#TODO check that allele_ref and allele_alt are not swapped
names(res$snp)<-c("name", "chromosome",'genetic_dist','position', 'allele_ref','allele_alt')

# using the gen_tibble.matrix method
new_gen_tbl <- gen_tibble(x = res$geno,
indiv_meta = res$ind,
loci = res$snp,
backingfile = backingfile,
quiet=quiet,
valid_alleles = valid_alleles,
missing_alleles = missing_alleles)

return(new_gen_tbl)

}
235 changes: 235 additions & 0 deletions R/gt_extract_f2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
#' Compute and store blocked f2 statistics for ADMIXTOOLS 2
#'
#' This function prepares data for various *ADMIXTOOLS 2* functions fro the
#' package *ADMIXTOOLS 2*. It
#' takes a [`gen_tibble`],
#' computes allele frequencies and blocked f2-statistics,
#' and writes the results to `outdir`. It is equivalent to
#' `admixtools::extract_f2()`.
#' @param .x a [`gen_tibble`]
#' @param outdir Directory where data will be stored.
#' @param blgsize SNP block size in Morgan. Default is 0.05 (5 cM). If `blgsize`
#' is 100 or greater, if will be interpreted as base pair distance rather than
#' centimorgan distance.
#' @param maxmem Maximum amount of memory to be used. If the required amount
#' of memory exceeds `maxmem`, allele frequency data will be split into blocks,
#' and the computation will be performed separately on each block pair.
#' This doesn't put a precise cap on the amount of memory used (it used to
#' at some point). Set this parameter to lower values if you run out of memory
#' while running this function. Set it to higher values if this function is
#' too slow and you have lots of memory.
#' @param maxmiss Discard SNPs which are missing in a fraction of populations
#' higher than `maxmiss`
#' @param minmaf Discard SNPs with minor allele frequency less than `minmaf`
#' @param maxmaf Discard SNPs with minor allele frequency greater than
#' than `maxmaf`
#' @param minac2 Discard SNPs with allele count lower than 2 in any population
#' (default `FALSE`). This option should be set to `TRUE` when computing
#' f3-statistics where one population consists mostly of pseudohaploid samples.
#' Otherwise heterozygosity estimates and thus f3-estimates can be biased.
#' `minac2 == 2` will discard SNPs with allele count lower than 2 in any
#' non-singleton population (this option is experimental and is based on the
#' hypothesis that using SNPs with allele count lower than 2 only leads to
#' biases in non-singleton populations). Note that, While the `minac2` option
#' discards
#' SNPs with allele count lower than 2 in any population, the \code{qp3pop}
#' function will only discard SNPs with allele count lower than 2 in the
#' first (target) population (when the first argument is the prefix of
#' a genotype file; i.e. it is applied directly to a genotype file, not via
#' precomputing f2 from a [`gen_tibble`]).
#' @param outpop Keep only SNPs which are heterozygous in this population
#' @param outpop_scale Scale f2-statistics by the inverse `outpop`
#' heteroygosity (`1/(p*(1-p))`). Providing `outpop` and setting
#' `outpop_scale` to `TRUE` will give the same results as the original
#' *qpGraph* when the `outpop` parameter has been set, but it has the
#' disadvantage of treating one population different from the others. This
#' may limit the use of these f2-statistics for other models.
#' @param transitions Set this to `FALSE` to exclude transition SNPs
#' @param transversions Set this to `FALSE` to exclude transversion SNPs
#' @param overwrite Overwrite existing files in `outdir`
#' @param adjust_pseudohaploid Genotypes of pseudohaploid samples are
#' usually coded as `0` or `2`, even though only one allele is observed.
#' `adjust_pseudohaploid` ensures that the observed allele count increases
#' only by `1` for each pseudohaploid sample. If `TRUE` (default), samples
#' that don't have any genotypes coded as `1` among the first 1000 SNPs are
#' automatically identified as pseudohaploid. This leads to slightly more
#' accurate estimates of f-statistics. Setting this parameter to `FALSE`
#' treats all samples as diploid and is equivalent to the *ADMIXTOOLS* `
#' inbreed: NO` option. Setting `adjust_pseudohaploid` to an integer `n`
#' will check the first `n` SNPs instead of the first 1000 SNPs.
#' @param fst Write files with pairwise FST for every population pair.
#' Setting this to FALSE can make `extract_f2` faster and will require less memory.
#' @param afprod Write files with allele frequency products for every
#' population pair. Setting this to FALSE can make `extract_f2` faster and
#' will require less memory.
#' @param poly_only Specify whether SNPs with identical allele frequencies in
#' every population should be discarded (`poly_only = TRUE`), or whether they
#' should be used (`poly_only = FALSE`). By default (`poly_only = c("f2")`),
#' these SNPs will be used to compute FST and allele frequency products, but
#' not to compute f2 (this is the default option in the original ADMIXTOOLS).
#' @param apply_corr Apply small-sample-size correction when computing
#' f2-statistics (default `TRUE`)
#' @param n_cores Parallelize computation across `n_cores` cores.
#' @param quiet Suppress printing of progress updates
#' @return SNP metadata (invisibly)
#' @export

gt_extract_f2<- function(.x , outdir=NULL, blgsize = 0.05, maxmem = 8000,
maxmiss = 0, minmaf = 0, maxmaf = 0.5, minac2 = FALSE, outpop = NULL, outpop_scale = TRUE,
transitions = TRUE, transversions = TRUE,
overwrite = FALSE,
adjust_pseudohaploid = TRUE, fst = TRUE, afprod = TRUE,
poly_only = c('f2'), apply_corr = TRUE, n_cores = 1, quiet = FALSE) {
if (!requireNamespace("admixtools", quietly = TRUE)) {
stop(
"to use this function, first install package 'admixtools' with\n",
"devtools::install_github('uqrmaie1/admixtools')")
}

# parameters that don't make sense with a gen_tibble
inds = NULL
pops = NULL
pops2 = NULL
auto_only = FALSE
keepsnps = NULL

if (!inherits(.x,"grouped_df")){
stop (".x should be a grouped df")
}
# if no outdir is given, create a subdirectory f2 in the path of the gen_tibble rds
if (is.null(outdir)){
outdir <- file.path(dirname(.gt_get_bigsnp(.x)$genotypes$rds),"f2")
}

verbose <- !quiet
afdat = gt_to_aftable(.x,
adjust_pseudohaploid = adjust_pseudohaploid,
n_cores = n_cores)

if(is.null(inds)) pops = union(pops, pops2)

afdat %<>% discard_from_aftable(maxmiss = maxmiss, minmaf = minmaf, maxmaf = maxmaf, minac2 = minac2, outpop = outpop,
transitions = transitions, transversions = transversions,
keepsnps = keepsnps, auto_only = auto_only, poly_only = FALSE)
afdat$snpfile %<>% mutate(poly = as.logical(cpp_is_polymorphic(afdat$afs)))
if(sum(afdat$snpfile$poly) == 0) stop('There are no informative SNPs!')

if(verbose) message(paste0(nrow(afdat$afs), ' SNPs remain after filtering. ',
sum(afdat$snpfile$poly),' are polymorphic.\n'))

if(isTRUE(poly_only)) poly_only = c('f2', 'ap', 'fst')
arrs = afs_to_f2_blocks(afdat, outdir = outdir, overwrite = overwrite, maxmem = maxmem, poly_only = poly_only,
pops1 = pops, pops2 = pops2, outpop = if(outpop_scale) outpop else NULL,
blgsize = blgsize, afprod = afprod, fst = fst, apply_corr = apply_corr,
n_cores = n_cores, verbose = verbose)

if(is.null(outdir)) return(arrs)

if(verbose) message(paste0('Data written to ', outdir, '/\n'))
invisible(afdat$snpfile)
}


#' Create an allele frequency table (aftable) for admixtools
#'
#' This function is equivalent to `anygeno_to_aftable()`, but the aftable is
#' created directly from the `gen_tibble`.
#' @param .x the [`gen_tibble`]
#' @param adjust_pseudohaploid Genotypes of pseudohaploid samples are
#' usually coded as `0` or `2`, even though only one allele is observed.
#' `adjust_pseudohaploid` ensures that the observed allele count increases
#' only by `1` for each pseudohaploid sample. If `TRUE` (default), samples
#' that don't have any genotypes coded as `1` among the first 1000 SNPs are
#' automatically identified as pseudohaploid. This leads to slightly more
#' accurate estimates of f-statistics. Setting this parameter to `FALSE`
#' treats all samples as diploid and is equivalent to the *ADMIXTOOLS* `
#' inbreed: NO` option. Setting `adjust_pseudohaploid` to an integer `n`
#' will check the first `n` SNPs instead of the first 1000 SNPs.
#' @param n_cores Parallelize computation across `n_cores` cores.
#' @returns a list of 3 elements: afs, counts, and snp
#' @keywords internal

gt_to_aftable <- function(.x, adjust_pseudohaploid= TRUE, n_cores = bigstatsr::nb_cores()){
if (!inherits(.x,"grouped_df")){
stop (".x should be a grouped df")
}
geno_fbm <- .gt_get_bigsnp(.x)$genotypes

# TODO we need a version with mixed ploidy that allows to account for haploids (or pseudohaploids)
if (adjust_pseudohaploid){
if (is.numeric(adjust_pseudohaploid)){
n_test <- adjust_pseudohaploid
} else {
n_test <- 1000
}
ploidy <- identify_pseudohaploids(.x, n_test)
aftable <- gt_grouped_alt_freq_pseudohap(BM = geno_fbm,rowInd = .gt_bigsnp_rows(.x),
colInd = .gt_bigsnp_cols(.x),
groupIds = dplyr::group_indices(.x)-1,
ngroups = max(dplyr::group_indices(.x)),
ploidy = ploidy,
ncores = n_cores)

} else {
aftable <- gt_grouped_alt_freq_diploid(BM = geno_fbm,rowInd = .gt_bigsnp_rows(.x),
colInd = .gt_bigsnp_cols(.x),
groupIds = dplyr::group_indices(.x)-1,
ngroups = max(dplyr::group_indices(.x)),
ncores = n_cores)

}
names(aftable)<-c("afs","counts")

.group_levels = .x %>% group_keys() %>% pull(1)
dimnames(aftable$afs)<-list(loci_names(.x),.group_levels)
dimnames(aftable$counts)<-list(loci_names(.x),.group_levels)

loci_new_names <- c(SNP = "name", CHR = "chromosome", POS="position",cm="genetic_dist",
A1 = "allele_alt",A2 = "allele_ref")
snp <- show_loci(.x) %>% select(-all_of("big_index")) %>%
rename(dplyr::all_of(loci_new_names)) %>%
dplyr::relocate(all_of(c("CHR", "SNP", "cm", "POS", "A1", "A2")))
snp$A1[is.na(snp$A1)]<-"0"
class(snp)<-c("spec_tbl_df",class(snp))
# we are missing column specs, which are generated by readr::col_spec
# do we need to make sure that CHR and SNP are character?
snp$CHR <- as.character(snp$CHR)
snp$SNP <- as.character(snp$SNP)

aftable$snpfile <- snp
return(aftable)
}




# import some functions that are not exported by admixtools
discard_from_aftable <- function(...){
utils::getFromNamespace("discard_from_aftable", "admixtools")(...)}

afs_to_f2_blocks <- function(...){
utils::getFromNamespace("afs_to_f2_blocks", "admixtools")(...)}

cpp_is_polymorphic <- function(...) {
utils::getFromNamespace("cpp_is_polymorphic", "admixtools")(...)}

#' Identify pseudohaploids
#'
#' Pseudohaploids are coded as all homozygotes; we find them by checking the
#' first 'n_test' loci and call a pseudohaploid if they have zero heterozygosity
#' (this is the same strategy employed in admixtools)
#' @param x the gen_tibble
#' @param n_test the number of loci being tested
#' @return a numeric vector of ploidy
#' @keywords internal
identify_pseudohaploids <- function (x, n_test = 1000){
if (n_test>count_loci(x)){
n_test <- count_loci(x)
}
sub_x <- select_loci(x,.sel_arg = dplyr::all_of(c(1:n_test))) %>%
dplyr::ungroup()
het_obs <- indiv_het_obs(sub_x)
ploidy <-rep(2,nrow(x))
ploidy[het_obs==0]<-1
return(ploidy)
}
Loading