Merge pull request #39 from EvolEcolGroup/admixtools

Admixtools
EvolEcolGroup · May 17, 2024 · f531d94 · f531d94
2 parents 7a4bdf0 + 57bf196
commit f531d94
Show file tree

Hide file tree

Showing 38 changed files with 1,227 additions and 172 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: tidypopgen
 Title: Tidy Population Genetics
-Version: 0.0.0.9011
+Version: 0.0.0.9012
 Authors@R: 
     c(person("Andrea", "Manica", email = "[email protected]", 
            role = c("aut", "cre"),
@@ -37,6 +37,7 @@ Imports:
     vctrs
 Suggests:
     adegenet,
+    admixtools,
     broom,
     hierfstat,
     knitr,
@@ -46,6 +47,8 @@ Suggests:
     readr,
     testthat (>= 3.0.0),
     vcfR
+Remotes:
+    uqrmaie1/admixtools
 VignetteBuilder: knitr
 Config/testthat/edition: 3
 LinkingTo: 

diff --git a/NAMESPACE b/NAMESPACE
@@ -45,9 +45,6 @@ S3method(loci_missingness,tbl_df)
 S3method(loci_missingness,vctrs_bigSNP)
 S3method(loci_names,tbl_df)
 S3method(loci_names,vctrs_bigSNP)
-S3method(loci_sums,grouped_df)
-S3method(loci_sums,tbl_df)
-S3method(loci_sums,vctrs_bigSNP)
 S3method(loci_transitions,grouped_df)
 S3method(loci_transitions,tbl_df)
 S3method(loci_transitions,vctrs_bigSNP)
@@ -85,6 +82,7 @@ export(gt_as_plink)
 export(gt_cluster_pca)
 export(gt_cluster_pca_best_k)
 export(gt_dapc)
+export(gt_extract_f2)
 export(gt_get_file_names)
 export(gt_has_imputed)
 export(gt_impute_simple)
@@ -107,7 +105,6 @@ export(loci_ld_clump)
 export(loci_maf)
 export(loci_missingness)
 export(loci_names)
-export(loci_sums)
 export(loci_transitions)
 export(loci_transversions)
 export(pairwise_allele_sharing)
@@ -133,6 +130,7 @@ importFrom(Rcpp,sourceCpp)
 importFrom(generics,augment)
 importFrom(generics,tidy)
 importFrom(ggplot2,autoplot)
+importFrom(magrittr,"%<>%")
 importFrom(magrittr,"%>%")
 importFrom(rlang,":=")
 useDynLib(tidypopgen, .registration = TRUE)
diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,6 +1,22 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
+gt_grouped_alt_freq_diploid <- function(BM, rowInd, colInd, groupIds, ngroups, ncores) {
+    .Call(`_tidypopgen_gt_grouped_alt_freq_diploid`, BM, rowInd, colInd, groupIds, ngroups, ncores)
+}
+
+gt_grouped_alt_freq_pseudohap <- function(BM, rowInd, colInd, groupIds, ngroups, ploidy, ncores) {
+    .Call(`_tidypopgen_gt_grouped_alt_freq_pseudohap`, BM, rowInd, colInd, groupIds, ngroups, ploidy, ncores)
+}
+
+gt_grouped_missingness <- function(BM, rowInd, colInd, groupIds, ngroups, ncores) {
+    .Call(`_tidypopgen_gt_grouped_missingness`, BM, rowInd, colInd, groupIds, ngroups, ncores)
+}
+
+gt_grouped_summaries <- function(BM, rowInd, colInd, groupIds, ngroups, ncores) {
+    .Call(`_tidypopgen_gt_grouped_summaries`, BM, rowInd, colInd, groupIds, ngroups, ncores)
+}
+
 SNPHWE2 <- function(obs_hets, obs_hom1, obs_hom2, midp) {
     .Call(`_tidypopgen_SNPHWE2`, obs_hets, obs_hom1, obs_hom2, midp)
 }

diff --git a/R/gen_tibble.R b/R/gen_tibble.R
@@ -3,14 +3,17 @@
 #' A `gen_tibble` stores genotypes for individuals in a tidy format. DESCRIBE
 #' here the format
 #' @param x can be:
-#' - a string giving the path to a PLINK BED or PED file. The correspective
-#' BIM and FAM fiel for the BED, or MAP for PED are expected to be in the same
+#' - a string giving the path to a PLINK BED or PED file. The associated
+#' BIM and FAM files for the BED, or MAP for PED are expected to be in the same
 #' directory and have the same file name.
 #' - a string giving the path to a RDS file storing a `bigSNP` object from
 #' the `bigsnpr` package (usually created with [bigsnpr::snp_readBed()])
 #' - a string giving the path to a vcf file. Note that we currently read the whole
 #' vcf in memory with `vcfR`, so only smallish vcf can be imported. Only biallelic
 #' SNPs will be considered.
+#' - a string giving the path to a packedancestry .geno file. The associated
+#' .ind and .snp files are expected to be in the same directory and share the
+#' same file name prefix.
 #' - a genotype matrix of dosages (0, 1, 2, NA) giving the dosage of the alternate
 #' allele.
 #' @param indiv_meta a list, data.frame or tibble with compulsory columns 'id'
@@ -91,6 +94,12 @@ gen_tibble.character <-
                        missing_alleles= missing_alleles,
                        backingfile = backingfile,
                        quiet = quiet)
+  } else if (tolower(file_ext(x))=="geno"){
+    gen_tibble_packedancestry(x = x, ...,
+                   valid_alleles= valid_alleles,
+                   missing_alleles= missing_alleles,
+                   backingfile = backingfile,
+                   quiet = quiet)
   } else  {
     stop("file_path should be pointing to a either a PLINK .bed or .ped file, a bigSNP .rds file or a VCF .vcf or .vcf.gz file")
   }

diff --git a/R/gen_tibble_packedancestry.R b/R/gen_tibble_packedancestry.R
@@ -0,0 +1,41 @@
+#A function to read geno packedancestrymap files
+gen_tibble_packedancestry <- function(x, ...,
+                            valid_alleles = c("A", "T", "C", "G"),
+                            missing_alleles = c("0","."),
+                            backingfile = NULL, quiet = FALSE) {
+  # Substitute .ped with .map
+  map_file <- sub("\\.geno$", ".snp", x)
+  if (!file.exists(map_file)){
+    stop("snp file ",map_file," does not exist")
+  }
+  ind_file <- sub("\\.geno$", ".ind", x)
+  if (!file.exists(ind_file)){
+    stop("ind file ",ind_file," does not exist")
+  }
+
+  if (is.null(backingfile)){
+    backingfile <- sub("\\.geno$", "", x)
+  }
+  if (file_ext(backingfile)=="bk"){
+    backingfile <- bigstatsr::sub_bk(backingfile,"")
+  }
+
+  res <- admixtools::read_packedancestrymap(sub("\\.geno$", "", x),
+                                                  transpose = TRUE,
+                                            ...)
+  names(res$ind)<-c("id","sex","population")
+  #TODO check that allele_ref and allele_alt are not swapped
+  names(res$snp)<-c("name", "chromosome",'genetic_dist','position', 'allele_ref','allele_alt')
+
+  # using the gen_tibble.matrix method
+  new_gen_tbl <- gen_tibble(x = res$geno,
+                            indiv_meta = res$ind,
+                            loci = res$snp,
+                            backingfile = backingfile,
+                            quiet=quiet,
+                            valid_alleles = valid_alleles,
+                            missing_alleles = missing_alleles)
+
+  return(new_gen_tbl)
+
+}
diff --git a/R/gt_extract_f2.R b/R/gt_extract_f2.R
@@ -0,0 +1,235 @@
+#' Compute and store blocked f2 statistics for ADMIXTOOLS 2
+#'
+#' This function prepares data for various *ADMIXTOOLS 2* functions fro the
+#' package *ADMIXTOOLS 2*. It
+#' takes a [`gen_tibble`],
+#' computes allele frequencies and blocked f2-statistics,
+#' and writes the results to `outdir`. It is equivalent to
+#' `admixtools::extract_f2()`.
+#' @param .x a [`gen_tibble`]
+#' @param outdir Directory where data will be stored.
+#' @param blgsize SNP block size in Morgan. Default is 0.05 (5 cM). If `blgsize`
+#'  is 100 or greater, if will be interpreted as base pair distance rather than
+#'  centimorgan distance.
+#' @param maxmem Maximum amount of memory to be used. If the required amount
+#' of memory exceeds `maxmem`, allele frequency data will be split into blocks,
+#'  and the computation will be performed separately on each block pair.
+#'  This doesn't put a precise cap on the amount of memory used (it used to
+#'  at some point). Set this parameter to lower values if you run out of memory
+#'  while running this function. Set it to higher values if this function is
+#'  too slow and you have lots of memory.
+#' @param maxmiss Discard SNPs which are missing in a fraction of populations
+#' higher than `maxmiss`
+#' @param minmaf Discard SNPs with minor allele frequency less than `minmaf`
+#' @param maxmaf Discard SNPs with minor allele frequency greater than
+#' than `maxmaf`
+#' @param minac2 Discard SNPs with allele count lower than 2 in any population
+#' (default `FALSE`). This option should be set to `TRUE` when computing
+#' f3-statistics where one population consists mostly of pseudohaploid samples.
+#' Otherwise heterozygosity estimates and thus f3-estimates can be biased.
+#' `minac2 == 2` will discard SNPs with allele count lower than 2 in any
+#' non-singleton population (this option is experimental and is based on the
+#' hypothesis that using SNPs with allele count lower than 2 only leads to
+#' biases in non-singleton populations). Note that, While the `minac2` option
+#' discards
+#' SNPs with allele count lower than 2 in any population, the \code{qp3pop}
+#' function will only discard SNPs with allele count lower than 2 in the
+#' first (target) population (when the first argument is the prefix of
+#' a genotype file; i.e. it is applied directly to a genotype file, not via
+#' precomputing f2 from a [`gen_tibble`]).
+#' @param outpop Keep only SNPs which are heterozygous in this population
+#' @param outpop_scale Scale f2-statistics by the inverse `outpop`
+#' heteroygosity (`1/(p*(1-p))`). Providing `outpop` and setting
+#' `outpop_scale` to `TRUE` will give the same results as the original
+#' *qpGraph* when the `outpop` parameter has been set, but it has the
+#' disadvantage of treating one population different from the others. This
+#' may limit the use of these f2-statistics for other models.
+#' @param transitions Set this to `FALSE` to exclude transition SNPs
+#' @param transversions Set this to `FALSE` to exclude transversion SNPs
+#' @param overwrite Overwrite existing files in `outdir`
+#' @param adjust_pseudohaploid Genotypes of pseudohaploid samples are
+#' usually coded as `0` or `2`, even though only one allele is observed.
+#' `adjust_pseudohaploid` ensures that the observed allele count increases
+#' only by `1` for each pseudohaploid sample. If `TRUE` (default), samples
+#' that don't have any genotypes coded as `1` among the first 1000 SNPs are
+#' automatically identified as pseudohaploid. This leads to slightly more
+#' accurate estimates of f-statistics. Setting this parameter to `FALSE`
+#' treats all samples as diploid and is equivalent to the *ADMIXTOOLS* `
+#' inbreed: NO` option. Setting `adjust_pseudohaploid` to an integer `n`
+#' will check the first `n` SNPs instead of the first 1000 SNPs.
+#' @param fst Write files with pairwise FST for every population pair.
+#' Setting this to FALSE can make `extract_f2` faster and will require less memory.
+#' @param afprod  Write files with allele frequency products for every
+#' population pair. Setting this to FALSE can make `extract_f2` faster and
+#' will require less memory.
+#' @param poly_only Specify whether SNPs with identical allele frequencies in
+#' every population should be discarded (`poly_only = TRUE`), or whether they
+#' should be used (`poly_only = FALSE`). By default (`poly_only = c("f2")`),
+#' these SNPs will be used to compute FST and allele frequency products, but
+#' not to compute f2 (this is the default option in the original ADMIXTOOLS).
+#' @param apply_corr Apply small-sample-size correction when computing
+#' f2-statistics (default `TRUE`)
+#' @param n_cores Parallelize computation across `n_cores` cores.
+#' @param quiet Suppress printing of progress updates
+#' @return SNP metadata (invisibly)
+#' @export
+
+gt_extract_f2<- function(.x , outdir=NULL, blgsize = 0.05, maxmem = 8000,
+                         maxmiss = 0, minmaf = 0, maxmaf = 0.5, minac2 = FALSE, outpop = NULL, outpop_scale = TRUE,
+                         transitions = TRUE, transversions = TRUE,
+                          overwrite = FALSE,
+                         adjust_pseudohaploid = TRUE, fst = TRUE, afprod = TRUE,
+                         poly_only = c('f2'), apply_corr = TRUE, n_cores = 1, quiet = FALSE) {
+  if (!requireNamespace("admixtools", quietly = TRUE)) {
+    stop(
+      "to use this function, first install package 'admixtools' with\n",
+      "devtools::install_github('uqrmaie1/admixtools')")
+  }
+
+  # parameters that don't make sense with a gen_tibble
+  inds = NULL
+  pops = NULL
+  pops2 = NULL
+  auto_only = FALSE
+  keepsnps = NULL
+
+  if (!inherits(.x,"grouped_df")){
+    stop (".x should be a grouped df")
+  }
+  # if no outdir is given, create a subdirectory f2 in the path of the gen_tibble rds
+  if (is.null(outdir)){
+    outdir <- file.path(dirname(.gt_get_bigsnp(.x)$genotypes$rds),"f2")
+  }
+
+  verbose <- !quiet
+  afdat = gt_to_aftable(.x,
+                        adjust_pseudohaploid = adjust_pseudohaploid,
+                        n_cores = n_cores)
+
+  if(is.null(inds)) pops = union(pops, pops2)
+
+  afdat %<>% discard_from_aftable(maxmiss = maxmiss, minmaf = minmaf, maxmaf = maxmaf, minac2 = minac2, outpop = outpop,
+                                               transitions = transitions, transversions = transversions,
+                                               keepsnps = keepsnps, auto_only = auto_only, poly_only = FALSE)
+  afdat$snpfile %<>% mutate(poly = as.logical(cpp_is_polymorphic(afdat$afs)))
+  if(sum(afdat$snpfile$poly) == 0) stop('There are no informative SNPs!')
+
+  if(verbose) message(paste0(nrow(afdat$afs), ' SNPs remain after filtering. ',
+                             sum(afdat$snpfile$poly),' are polymorphic.\n'))
+
+  if(isTRUE(poly_only)) poly_only = c('f2', 'ap', 'fst')
+  arrs = afs_to_f2_blocks(afdat, outdir = outdir, overwrite = overwrite, maxmem = maxmem, poly_only = poly_only,
+                                       pops1 = pops, pops2 = pops2, outpop = if(outpop_scale) outpop else NULL,
+                                       blgsize = blgsize, afprod = afprod, fst = fst, apply_corr = apply_corr,
+                                       n_cores = n_cores, verbose = verbose)
+
+  if(is.null(outdir)) return(arrs)
+
+  if(verbose) message(paste0('Data written to ', outdir, '/\n'))
+  invisible(afdat$snpfile)
+}
+
+
+#' Create an allele frequency table (aftable) for admixtools
+#'
+#' This function is equivalent to `anygeno_to_aftable()`, but the aftable is
+#' created directly from the `gen_tibble`.
+#' @param .x the [`gen_tibble`]
+#' @param adjust_pseudohaploid Genotypes of pseudohaploid samples are
+#' usually coded as `0` or `2`, even though only one allele is observed.
+#' `adjust_pseudohaploid` ensures that the observed allele count increases
+#' only by `1` for each pseudohaploid sample. If `TRUE` (default), samples
+#' that don't have any genotypes coded as `1` among the first 1000 SNPs are
+#' automatically identified as pseudohaploid. This leads to slightly more
+#' accurate estimates of f-statistics. Setting this parameter to `FALSE`
+#' treats all samples as diploid and is equivalent to the *ADMIXTOOLS* `
+#' inbreed: NO` option. Setting `adjust_pseudohaploid` to an integer `n`
+#' will check the first `n` SNPs instead of the first 1000 SNPs.
+#' @param n_cores Parallelize computation across `n_cores` cores.
+#' @returns a list of 3 elements: afs, counts, and snp
+#' @keywords internal
+
+gt_to_aftable <- function(.x, adjust_pseudohaploid= TRUE, n_cores = bigstatsr::nb_cores()){
+  if (!inherits(.x,"grouped_df")){
+    stop (".x should be a grouped df")
+  }
+  geno_fbm <- .gt_get_bigsnp(.x)$genotypes
+
+  # TODO we need a version with mixed ploidy that allows to account for haploids (or pseudohaploids)
+  if (adjust_pseudohaploid){
+    if (is.numeric(adjust_pseudohaploid)){
+      n_test <- adjust_pseudohaploid
+    } else {
+      n_test <- 1000
+    }
+    ploidy <- identify_pseudohaploids(.x, n_test)
+    aftable <- gt_grouped_alt_freq_pseudohap(BM = geno_fbm,rowInd = .gt_bigsnp_rows(.x),
+                                           colInd = .gt_bigsnp_cols(.x),
+                                           groupIds = dplyr::group_indices(.x)-1,
+                                           ngroups = max(dplyr::group_indices(.x)),
+                                           ploidy = ploidy,
+                                           ncores = n_cores)
+
+  } else {
+    aftable <- gt_grouped_alt_freq_diploid(BM = geno_fbm,rowInd = .gt_bigsnp_rows(.x),
+                                           colInd = .gt_bigsnp_cols(.x),
+                                           groupIds = dplyr::group_indices(.x)-1,
+                                           ngroups = max(dplyr::group_indices(.x)),
+                                           ncores = n_cores)
+
+  }
+  names(aftable)<-c("afs","counts")
+
+  .group_levels = .x %>% group_keys() %>% pull(1)
+  dimnames(aftable$afs)<-list(loci_names(.x),.group_levels)
+  dimnames(aftable$counts)<-list(loci_names(.x),.group_levels)
+
+  loci_new_names <- c(SNP = "name", CHR = "chromosome", POS="position",cm="genetic_dist",
+                      A1 = "allele_alt",A2 = "allele_ref")
+  snp <- show_loci(.x) %>% select(-all_of("big_index")) %>%
+    rename(dplyr::all_of(loci_new_names)) %>%
+    dplyr::relocate(all_of(c("CHR", "SNP", "cm", "POS", "A1", "A2")))
+  snp$A1[is.na(snp$A1)]<-"0"
+  class(snp)<-c("spec_tbl_df",class(snp))
+  # we are missing column specs, which are generated by readr::col_spec
+  # do we need to make sure that CHR and SNP are character?
+  snp$CHR <- as.character(snp$CHR)
+  snp$SNP <- as.character(snp$SNP)
+
+  aftable$snpfile <- snp
+  return(aftable)
+}
+
+
+
+
+# import some functions that are not exported by admixtools
+discard_from_aftable <- function(...){
+  utils::getFromNamespace("discard_from_aftable", "admixtools")(...)}
+
+afs_to_f2_blocks <- function(...){
+  utils::getFromNamespace("afs_to_f2_blocks", "admixtools")(...)}
+
+cpp_is_polymorphic <- function(...) {
+  utils::getFromNamespace("cpp_is_polymorphic", "admixtools")(...)}
+
+#' Identify pseudohaploids
+#'
+#' Pseudohaploids are coded as all homozygotes; we find them by checking the
+#' first 'n_test' loci and call a pseudohaploid if they have zero heterozygosity
+#' (this is the same strategy employed in admixtools)
+#' @param x the gen_tibble
+#' @param n_test the number of loci being tested
+#' @return a numeric vector of ploidy
+#' @keywords internal
+identify_pseudohaploids <- function (x, n_test = 1000){
+  if (n_test>count_loci(x)){
+    n_test <- count_loci(x)
+  }
+  sub_x <- select_loci(x,.sel_arg = dplyr::all_of(c(1:n_test))) %>%
+    dplyr::ungroup()
+  het_obs <- indiv_het_obs(sub_x)
+  ploidy <-rep(2,nrow(x))
+  ploidy[het_obs==0]<-1
+  return(ploidy)
+}