Skip to content

Commit

Permalink
Merge pull request #39 from EvolEcolGroup/admixtools
Browse files Browse the repository at this point in the history
Admixtools
  • Loading branch information
dramanica authored May 17, 2024
2 parents 7a4bdf0 + 57bf196 commit f531d94
Show file tree
Hide file tree
Showing 38 changed files with 1,227 additions and 172 deletions.
5 changes: 4 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: tidypopgen
Title: Tidy Population Genetics
Version: 0.0.0.9011
Version: 0.0.0.9012
Authors@R:
c(person("Andrea", "Manica", email = "[email protected]",
role = c("aut", "cre"),
Expand Down Expand Up @@ -37,6 +37,7 @@ Imports:
vctrs
Suggests:
adegenet,
admixtools,
broom,
hierfstat,
knitr,
Expand All @@ -46,6 +47,8 @@ Suggests:
readr,
testthat (>= 3.0.0),
vcfR
Remotes:
uqrmaie1/admixtools
VignetteBuilder: knitr
Config/testthat/edition: 3
LinkingTo:
Expand Down
6 changes: 2 additions & 4 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,6 @@ S3method(loci_missingness,tbl_df)
S3method(loci_missingness,vctrs_bigSNP)
S3method(loci_names,tbl_df)
S3method(loci_names,vctrs_bigSNP)
S3method(loci_sums,grouped_df)
S3method(loci_sums,tbl_df)
S3method(loci_sums,vctrs_bigSNP)
S3method(loci_transitions,grouped_df)
S3method(loci_transitions,tbl_df)
S3method(loci_transitions,vctrs_bigSNP)
Expand Down Expand Up @@ -85,6 +82,7 @@ export(gt_as_plink)
export(gt_cluster_pca)
export(gt_cluster_pca_best_k)
export(gt_dapc)
export(gt_extract_f2)
export(gt_get_file_names)
export(gt_has_imputed)
export(gt_impute_simple)
Expand All @@ -107,7 +105,6 @@ export(loci_ld_clump)
export(loci_maf)
export(loci_missingness)
export(loci_names)
export(loci_sums)
export(loci_transitions)
export(loci_transversions)
export(pairwise_allele_sharing)
Expand All @@ -133,6 +130,7 @@ importFrom(Rcpp,sourceCpp)
importFrom(generics,augment)
importFrom(generics,tidy)
importFrom(ggplot2,autoplot)
importFrom(magrittr,"%<>%")
importFrom(magrittr,"%>%")
importFrom(rlang,":=")
useDynLib(tidypopgen, .registration = TRUE)
16 changes: 16 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,22 @@
# Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393

gt_grouped_alt_freq_diploid <- function(BM, rowInd, colInd, groupIds, ngroups, ncores) {
.Call(`_tidypopgen_gt_grouped_alt_freq_diploid`, BM, rowInd, colInd, groupIds, ngroups, ncores)
}

gt_grouped_alt_freq_pseudohap <- function(BM, rowInd, colInd, groupIds, ngroups, ploidy, ncores) {
.Call(`_tidypopgen_gt_grouped_alt_freq_pseudohap`, BM, rowInd, colInd, groupIds, ngroups, ploidy, ncores)
}

gt_grouped_missingness <- function(BM, rowInd, colInd, groupIds, ngroups, ncores) {
.Call(`_tidypopgen_gt_grouped_missingness`, BM, rowInd, colInd, groupIds, ngroups, ncores)
}

gt_grouped_summaries <- function(BM, rowInd, colInd, groupIds, ngroups, ncores) {
.Call(`_tidypopgen_gt_grouped_summaries`, BM, rowInd, colInd, groupIds, ngroups, ncores)
}

SNPHWE2 <- function(obs_hets, obs_hom1, obs_hom2, midp) {
.Call(`_tidypopgen_SNPHWE2`, obs_hets, obs_hom1, obs_hom2, midp)
}
Expand Down
13 changes: 11 additions & 2 deletions R/gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,17 @@
#' A `gen_tibble` stores genotypes for individuals in a tidy format. DESCRIBE
#' here the format
#' @param x can be:
#' - a string giving the path to a PLINK BED or PED file. The correspective
#' BIM and FAM fiel for the BED, or MAP for PED are expected to be in the same
#' - a string giving the path to a PLINK BED or PED file. The associated
#' BIM and FAM files for the BED, or MAP for PED are expected to be in the same
#' directory and have the same file name.
#' - a string giving the path to a RDS file storing a `bigSNP` object from
#' the `bigsnpr` package (usually created with [bigsnpr::snp_readBed()])
#' - a string giving the path to a vcf file. Note that we currently read the whole
#' vcf in memory with `vcfR`, so only smallish vcf can be imported. Only biallelic
#' SNPs will be considered.
#' - a string giving the path to a packedancestry .geno file. The associated
#' .ind and .snp files are expected to be in the same directory and share the
#' same file name prefix.
#' - a genotype matrix of dosages (0, 1, 2, NA) giving the dosage of the alternate
#' allele.
#' @param indiv_meta a list, data.frame or tibble with compulsory columns 'id'
Expand Down Expand Up @@ -91,6 +94,12 @@ gen_tibble.character <-
missing_alleles= missing_alleles,
backingfile = backingfile,
quiet = quiet)
} else if (tolower(file_ext(x))=="geno"){
gen_tibble_packedancestry(x = x, ...,
valid_alleles= valid_alleles,
missing_alleles= missing_alleles,
backingfile = backingfile,
quiet = quiet)
} else {
stop("file_path should be pointing to a either a PLINK .bed or .ped file, a bigSNP .rds file or a VCF .vcf or .vcf.gz file")
}
Expand Down
41 changes: 41 additions & 0 deletions R/gen_tibble_packedancestry.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#A function to read geno packedancestrymap files
gen_tibble_packedancestry <- function(x, ...,
valid_alleles = c("A", "T", "C", "G"),
missing_alleles = c("0","."),
backingfile = NULL, quiet = FALSE) {
# Substitute .ped with .map
map_file <- sub("\\.geno$", ".snp", x)
if (!file.exists(map_file)){
stop("snp file ",map_file," does not exist")
}
ind_file <- sub("\\.geno$", ".ind", x)
if (!file.exists(ind_file)){
stop("ind file ",ind_file," does not exist")
}

if (is.null(backingfile)){
backingfile <- sub("\\.geno$", "", x)
}
if (file_ext(backingfile)=="bk"){
backingfile <- bigstatsr::sub_bk(backingfile,"")
}

res <- admixtools::read_packedancestrymap(sub("\\.geno$", "", x),
transpose = TRUE,
...)
names(res$ind)<-c("id","sex","population")
#TODO check that allele_ref and allele_alt are not swapped
names(res$snp)<-c("name", "chromosome",'genetic_dist','position', 'allele_ref','allele_alt')

# using the gen_tibble.matrix method
new_gen_tbl <- gen_tibble(x = res$geno,
indiv_meta = res$ind,
loci = res$snp,
backingfile = backingfile,
quiet=quiet,
valid_alleles = valid_alleles,
missing_alleles = missing_alleles)

return(new_gen_tbl)

}
235 changes: 235 additions & 0 deletions R/gt_extract_f2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
#' Compute and store blocked f2 statistics for ADMIXTOOLS 2
#'
#' This function prepares data for various *ADMIXTOOLS 2* functions fro the
#' package *ADMIXTOOLS 2*. It
#' takes a [`gen_tibble`],
#' computes allele frequencies and blocked f2-statistics,
#' and writes the results to `outdir`. It is equivalent to
#' `admixtools::extract_f2()`.
#' @param .x a [`gen_tibble`]
#' @param outdir Directory where data will be stored.
#' @param blgsize SNP block size in Morgan. Default is 0.05 (5 cM). If `blgsize`
#' is 100 or greater, if will be interpreted as base pair distance rather than
#' centimorgan distance.
#' @param maxmem Maximum amount of memory to be used. If the required amount
#' of memory exceeds `maxmem`, allele frequency data will be split into blocks,
#' and the computation will be performed separately on each block pair.
#' This doesn't put a precise cap on the amount of memory used (it used to
#' at some point). Set this parameter to lower values if you run out of memory
#' while running this function. Set it to higher values if this function is
#' too slow and you have lots of memory.
#' @param maxmiss Discard SNPs which are missing in a fraction of populations
#' higher than `maxmiss`
#' @param minmaf Discard SNPs with minor allele frequency less than `minmaf`
#' @param maxmaf Discard SNPs with minor allele frequency greater than
#' than `maxmaf`
#' @param minac2 Discard SNPs with allele count lower than 2 in any population
#' (default `FALSE`). This option should be set to `TRUE` when computing
#' f3-statistics where one population consists mostly of pseudohaploid samples.
#' Otherwise heterozygosity estimates and thus f3-estimates can be biased.
#' `minac2 == 2` will discard SNPs with allele count lower than 2 in any
#' non-singleton population (this option is experimental and is based on the
#' hypothesis that using SNPs with allele count lower than 2 only leads to
#' biases in non-singleton populations). Note that, While the `minac2` option
#' discards
#' SNPs with allele count lower than 2 in any population, the \code{qp3pop}
#' function will only discard SNPs with allele count lower than 2 in the
#' first (target) population (when the first argument is the prefix of
#' a genotype file; i.e. it is applied directly to a genotype file, not via
#' precomputing f2 from a [`gen_tibble`]).
#' @param outpop Keep only SNPs which are heterozygous in this population
#' @param outpop_scale Scale f2-statistics by the inverse `outpop`
#' heteroygosity (`1/(p*(1-p))`). Providing `outpop` and setting
#' `outpop_scale` to `TRUE` will give the same results as the original
#' *qpGraph* when the `outpop` parameter has been set, but it has the
#' disadvantage of treating one population different from the others. This
#' may limit the use of these f2-statistics for other models.
#' @param transitions Set this to `FALSE` to exclude transition SNPs
#' @param transversions Set this to `FALSE` to exclude transversion SNPs
#' @param overwrite Overwrite existing files in `outdir`
#' @param adjust_pseudohaploid Genotypes of pseudohaploid samples are
#' usually coded as `0` or `2`, even though only one allele is observed.
#' `adjust_pseudohaploid` ensures that the observed allele count increases
#' only by `1` for each pseudohaploid sample. If `TRUE` (default), samples
#' that don't have any genotypes coded as `1` among the first 1000 SNPs are
#' automatically identified as pseudohaploid. This leads to slightly more
#' accurate estimates of f-statistics. Setting this parameter to `FALSE`
#' treats all samples as diploid and is equivalent to the *ADMIXTOOLS* `
#' inbreed: NO` option. Setting `adjust_pseudohaploid` to an integer `n`
#' will check the first `n` SNPs instead of the first 1000 SNPs.
#' @param fst Write files with pairwise FST for every population pair.
#' Setting this to FALSE can make `extract_f2` faster and will require less memory.
#' @param afprod Write files with allele frequency products for every
#' population pair. Setting this to FALSE can make `extract_f2` faster and
#' will require less memory.
#' @param poly_only Specify whether SNPs with identical allele frequencies in
#' every population should be discarded (`poly_only = TRUE`), or whether they
#' should be used (`poly_only = FALSE`). By default (`poly_only = c("f2")`),
#' these SNPs will be used to compute FST and allele frequency products, but
#' not to compute f2 (this is the default option in the original ADMIXTOOLS).
#' @param apply_corr Apply small-sample-size correction when computing
#' f2-statistics (default `TRUE`)
#' @param n_cores Parallelize computation across `n_cores` cores.
#' @param quiet Suppress printing of progress updates
#' @return SNP metadata (invisibly)
#' @export

gt_extract_f2<- function(.x , outdir=NULL, blgsize = 0.05, maxmem = 8000,
maxmiss = 0, minmaf = 0, maxmaf = 0.5, minac2 = FALSE, outpop = NULL, outpop_scale = TRUE,
transitions = TRUE, transversions = TRUE,
overwrite = FALSE,
adjust_pseudohaploid = TRUE, fst = TRUE, afprod = TRUE,
poly_only = c('f2'), apply_corr = TRUE, n_cores = 1, quiet = FALSE) {
if (!requireNamespace("admixtools", quietly = TRUE)) {
stop(
"to use this function, first install package 'admixtools' with\n",
"devtools::install_github('uqrmaie1/admixtools')")
}

# parameters that don't make sense with a gen_tibble
inds = NULL
pops = NULL
pops2 = NULL
auto_only = FALSE
keepsnps = NULL

if (!inherits(.x,"grouped_df")){
stop (".x should be a grouped df")
}
# if no outdir is given, create a subdirectory f2 in the path of the gen_tibble rds
if (is.null(outdir)){
outdir <- file.path(dirname(.gt_get_bigsnp(.x)$genotypes$rds),"f2")
}

verbose <- !quiet
afdat = gt_to_aftable(.x,
adjust_pseudohaploid = adjust_pseudohaploid,
n_cores = n_cores)

if(is.null(inds)) pops = union(pops, pops2)

afdat %<>% discard_from_aftable(maxmiss = maxmiss, minmaf = minmaf, maxmaf = maxmaf, minac2 = minac2, outpop = outpop,
transitions = transitions, transversions = transversions,
keepsnps = keepsnps, auto_only = auto_only, poly_only = FALSE)
afdat$snpfile %<>% mutate(poly = as.logical(cpp_is_polymorphic(afdat$afs)))
if(sum(afdat$snpfile$poly) == 0) stop('There are no informative SNPs!')

if(verbose) message(paste0(nrow(afdat$afs), ' SNPs remain after filtering. ',
sum(afdat$snpfile$poly),' are polymorphic.\n'))

if(isTRUE(poly_only)) poly_only = c('f2', 'ap', 'fst')
arrs = afs_to_f2_blocks(afdat, outdir = outdir, overwrite = overwrite, maxmem = maxmem, poly_only = poly_only,
pops1 = pops, pops2 = pops2, outpop = if(outpop_scale) outpop else NULL,
blgsize = blgsize, afprod = afprod, fst = fst, apply_corr = apply_corr,
n_cores = n_cores, verbose = verbose)

if(is.null(outdir)) return(arrs)

if(verbose) message(paste0('Data written to ', outdir, '/\n'))
invisible(afdat$snpfile)
}


#' Create an allele frequency table (aftable) for admixtools
#'
#' This function is equivalent to `anygeno_to_aftable()`, but the aftable is
#' created directly from the `gen_tibble`.
#' @param .x the [`gen_tibble`]
#' @param adjust_pseudohaploid Genotypes of pseudohaploid samples are
#' usually coded as `0` or `2`, even though only one allele is observed.
#' `adjust_pseudohaploid` ensures that the observed allele count increases
#' only by `1` for each pseudohaploid sample. If `TRUE` (default), samples
#' that don't have any genotypes coded as `1` among the first 1000 SNPs are
#' automatically identified as pseudohaploid. This leads to slightly more
#' accurate estimates of f-statistics. Setting this parameter to `FALSE`
#' treats all samples as diploid and is equivalent to the *ADMIXTOOLS* `
#' inbreed: NO` option. Setting `adjust_pseudohaploid` to an integer `n`
#' will check the first `n` SNPs instead of the first 1000 SNPs.
#' @param n_cores Parallelize computation across `n_cores` cores.
#' @returns a list of 3 elements: afs, counts, and snp
#' @keywords internal

gt_to_aftable <- function(.x, adjust_pseudohaploid= TRUE, n_cores = bigstatsr::nb_cores()){
if (!inherits(.x,"grouped_df")){
stop (".x should be a grouped df")
}
geno_fbm <- .gt_get_bigsnp(.x)$genotypes

# TODO we need a version with mixed ploidy that allows to account for haploids (or pseudohaploids)
if (adjust_pseudohaploid){
if (is.numeric(adjust_pseudohaploid)){
n_test <- adjust_pseudohaploid
} else {
n_test <- 1000
}
ploidy <- identify_pseudohaploids(.x, n_test)
aftable <- gt_grouped_alt_freq_pseudohap(BM = geno_fbm,rowInd = .gt_bigsnp_rows(.x),
colInd = .gt_bigsnp_cols(.x),
groupIds = dplyr::group_indices(.x)-1,
ngroups = max(dplyr::group_indices(.x)),
ploidy = ploidy,
ncores = n_cores)

} else {
aftable <- gt_grouped_alt_freq_diploid(BM = geno_fbm,rowInd = .gt_bigsnp_rows(.x),
colInd = .gt_bigsnp_cols(.x),
groupIds = dplyr::group_indices(.x)-1,
ngroups = max(dplyr::group_indices(.x)),
ncores = n_cores)

}
names(aftable)<-c("afs","counts")

.group_levels = .x %>% group_keys() %>% pull(1)
dimnames(aftable$afs)<-list(loci_names(.x),.group_levels)
dimnames(aftable$counts)<-list(loci_names(.x),.group_levels)

loci_new_names <- c(SNP = "name", CHR = "chromosome", POS="position",cm="genetic_dist",
A1 = "allele_alt",A2 = "allele_ref")
snp <- show_loci(.x) %>% select(-all_of("big_index")) %>%
rename(dplyr::all_of(loci_new_names)) %>%
dplyr::relocate(all_of(c("CHR", "SNP", "cm", "POS", "A1", "A2")))
snp$A1[is.na(snp$A1)]<-"0"
class(snp)<-c("spec_tbl_df",class(snp))
# we are missing column specs, which are generated by readr::col_spec
# do we need to make sure that CHR and SNP are character?
snp$CHR <- as.character(snp$CHR)
snp$SNP <- as.character(snp$SNP)

aftable$snpfile <- snp
return(aftable)
}




# import some functions that are not exported by admixtools
discard_from_aftable <- function(...){
utils::getFromNamespace("discard_from_aftable", "admixtools")(...)}

afs_to_f2_blocks <- function(...){
utils::getFromNamespace("afs_to_f2_blocks", "admixtools")(...)}

cpp_is_polymorphic <- function(...) {
utils::getFromNamespace("cpp_is_polymorphic", "admixtools")(...)}

#' Identify pseudohaploids
#'
#' Pseudohaploids are coded as all homozygotes; we find them by checking the
#' first 'n_test' loci and call a pseudohaploid if they have zero heterozygosity
#' (this is the same strategy employed in admixtools)
#' @param x the gen_tibble
#' @param n_test the number of loci being tested
#' @return a numeric vector of ploidy
#' @keywords internal
identify_pseudohaploids <- function (x, n_test = 1000){
if (n_test>count_loci(x)){
n_test <- count_loci(x)
}
sub_x <- select_loci(x,.sel_arg = dplyr::all_of(c(1:n_test))) %>%
dplyr::ungroup()
het_obs <- indiv_het_obs(sub_x)
ploidy <-rep(2,nrow(x))
ploidy[het_obs==0]<-1
return(ploidy)
}
Loading

0 comments on commit f531d94

Please sign in to comment.