diff --git a/DESCRIPTION b/DESCRIPTION index 8f31f291..14eb5403 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tidypopgen Title: Tidy Population Genetics -Version: 0.0.0.9011 +Version: 0.0.0.9012 Authors@R: c(person("Andrea", "Manica", email = "am315@cam.ac.uk", role = c("aut", "cre"), @@ -37,6 +37,7 @@ Imports: vctrs Suggests: adegenet, + admixtools, broom, hierfstat, knitr, @@ -46,6 +47,8 @@ Suggests: readr, testthat (>= 3.0.0), vcfR +Remotes: + uqrmaie1/admixtools VignetteBuilder: knitr Config/testthat/edition: 3 LinkingTo: diff --git a/NAMESPACE b/NAMESPACE index c016254d..f8c72123 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -45,9 +45,6 @@ S3method(loci_missingness,tbl_df) S3method(loci_missingness,vctrs_bigSNP) S3method(loci_names,tbl_df) S3method(loci_names,vctrs_bigSNP) -S3method(loci_sums,grouped_df) -S3method(loci_sums,tbl_df) -S3method(loci_sums,vctrs_bigSNP) S3method(loci_transitions,grouped_df) S3method(loci_transitions,tbl_df) S3method(loci_transitions,vctrs_bigSNP) @@ -85,6 +82,7 @@ export(gt_as_plink) export(gt_cluster_pca) export(gt_cluster_pca_best_k) export(gt_dapc) +export(gt_extract_f2) export(gt_get_file_names) export(gt_has_imputed) export(gt_impute_simple) @@ -107,7 +105,6 @@ export(loci_ld_clump) export(loci_maf) export(loci_missingness) export(loci_names) -export(loci_sums) export(loci_transitions) export(loci_transversions) export(pairwise_allele_sharing) @@ -133,6 +130,7 @@ importFrom(Rcpp,sourceCpp) importFrom(generics,augment) importFrom(generics,tidy) importFrom(ggplot2,autoplot) +importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(rlang,":=") useDynLib(tidypopgen, .registration = TRUE) diff --git a/R/RcppExports.R b/R/RcppExports.R index cb4f1f51..57e983d5 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,6 +1,22 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +gt_grouped_alt_freq_diploid <- function(BM, rowInd, colInd, groupIds, ngroups, ncores) { + .Call(`_tidypopgen_gt_grouped_alt_freq_diploid`, BM, rowInd, colInd, groupIds, ngroups, ncores) +} + +gt_grouped_alt_freq_pseudohap <- function(BM, rowInd, colInd, groupIds, ngroups, ploidy, ncores) { + .Call(`_tidypopgen_gt_grouped_alt_freq_pseudohap`, BM, rowInd, colInd, groupIds, ngroups, ploidy, ncores) +} + +gt_grouped_missingness <- function(BM, rowInd, colInd, groupIds, ngroups, ncores) { + .Call(`_tidypopgen_gt_grouped_missingness`, BM, rowInd, colInd, groupIds, ngroups, ncores) +} + +gt_grouped_summaries <- function(BM, rowInd, colInd, groupIds, ngroups, ncores) { + .Call(`_tidypopgen_gt_grouped_summaries`, BM, rowInd, colInd, groupIds, ngroups, ncores) +} + SNPHWE2 <- function(obs_hets, obs_hom1, obs_hom2, midp) { .Call(`_tidypopgen_SNPHWE2`, obs_hets, obs_hom1, obs_hom2, midp) } diff --git a/R/gen_tibble.R b/R/gen_tibble.R index cfff22a2..2a3b03f2 100644 --- a/R/gen_tibble.R +++ b/R/gen_tibble.R @@ -3,14 +3,17 @@ #' A `gen_tibble` stores genotypes for individuals in a tidy format. DESCRIBE #' here the format #' @param x can be: -#' - a string giving the path to a PLINK BED or PED file. The correspective -#' BIM and FAM fiel for the BED, or MAP for PED are expected to be in the same +#' - a string giving the path to a PLINK BED or PED file. The associated +#' BIM and FAM files for the BED, or MAP for PED are expected to be in the same #' directory and have the same file name. #' - a string giving the path to a RDS file storing a `bigSNP` object from #' the `bigsnpr` package (usually created with [bigsnpr::snp_readBed()]) #' - a string giving the path to a vcf file. Note that we currently read the whole #' vcf in memory with `vcfR`, so only smallish vcf can be imported. Only biallelic #' SNPs will be considered. +#' - a string giving the path to a packedancestry .geno file. The associated +#' .ind and .snp files are expected to be in the same directory and share the +#' same file name prefix. #' - a genotype matrix of dosages (0, 1, 2, NA) giving the dosage of the alternate #' allele. #' @param indiv_meta a list, data.frame or tibble with compulsory columns 'id' @@ -91,6 +94,12 @@ gen_tibble.character <- missing_alleles= missing_alleles, backingfile = backingfile, quiet = quiet) + } else if (tolower(file_ext(x))=="geno"){ + gen_tibble_packedancestry(x = x, ..., + valid_alleles= valid_alleles, + missing_alleles= missing_alleles, + backingfile = backingfile, + quiet = quiet) } else { stop("file_path should be pointing to a either a PLINK .bed or .ped file, a bigSNP .rds file or a VCF .vcf or .vcf.gz file") } diff --git a/R/gen_tibble_packedancestry.R b/R/gen_tibble_packedancestry.R new file mode 100644 index 00000000..b3dbd0ef --- /dev/null +++ b/R/gen_tibble_packedancestry.R @@ -0,0 +1,41 @@ +#A function to read geno packedancestrymap files +gen_tibble_packedancestry <- function(x, ..., + valid_alleles = c("A", "T", "C", "G"), + missing_alleles = c("0","."), + backingfile = NULL, quiet = FALSE) { + # Substitute .ped with .map + map_file <- sub("\\.geno$", ".snp", x) + if (!file.exists(map_file)){ + stop("snp file ",map_file," does not exist") + } + ind_file <- sub("\\.geno$", ".ind", x) + if (!file.exists(ind_file)){ + stop("ind file ",ind_file," does not exist") + } + + if (is.null(backingfile)){ + backingfile <- sub("\\.geno$", "", x) + } + if (file_ext(backingfile)=="bk"){ + backingfile <- bigstatsr::sub_bk(backingfile,"") + } + + res <- admixtools::read_packedancestrymap(sub("\\.geno$", "", x), + transpose = TRUE, + ...) + names(res$ind)<-c("id","sex","population") + #TODO check that allele_ref and allele_alt are not swapped + names(res$snp)<-c("name", "chromosome",'genetic_dist','position', 'allele_ref','allele_alt') + + # using the gen_tibble.matrix method + new_gen_tbl <- gen_tibble(x = res$geno, + indiv_meta = res$ind, + loci = res$snp, + backingfile = backingfile, + quiet=quiet, + valid_alleles = valid_alleles, + missing_alleles = missing_alleles) + + return(new_gen_tbl) + +} diff --git a/R/gt_extract_f2.R b/R/gt_extract_f2.R new file mode 100644 index 00000000..b1914e6e --- /dev/null +++ b/R/gt_extract_f2.R @@ -0,0 +1,235 @@ +#' Compute and store blocked f2 statistics for ADMIXTOOLS 2 +#' +#' This function prepares data for various *ADMIXTOOLS 2* functions fro the +#' package *ADMIXTOOLS 2*. It +#' takes a [`gen_tibble`], +#' computes allele frequencies and blocked f2-statistics, +#' and writes the results to `outdir`. It is equivalent to +#' `admixtools::extract_f2()`. +#' @param .x a [`gen_tibble`] +#' @param outdir Directory where data will be stored. +#' @param blgsize SNP block size in Morgan. Default is 0.05 (5 cM). If `blgsize` +#' is 100 or greater, if will be interpreted as base pair distance rather than +#' centimorgan distance. +#' @param maxmem Maximum amount of memory to be used. If the required amount +#' of memory exceeds `maxmem`, allele frequency data will be split into blocks, +#' and the computation will be performed separately on each block pair. +#' This doesn't put a precise cap on the amount of memory used (it used to +#' at some point). Set this parameter to lower values if you run out of memory +#' while running this function. Set it to higher values if this function is +#' too slow and you have lots of memory. +#' @param maxmiss Discard SNPs which are missing in a fraction of populations +#' higher than `maxmiss` +#' @param minmaf Discard SNPs with minor allele frequency less than `minmaf` +#' @param maxmaf Discard SNPs with minor allele frequency greater than +#' than `maxmaf` +#' @param minac2 Discard SNPs with allele count lower than 2 in any population +#' (default `FALSE`). This option should be set to `TRUE` when computing +#' f3-statistics where one population consists mostly of pseudohaploid samples. +#' Otherwise heterozygosity estimates and thus f3-estimates can be biased. +#' `minac2 == 2` will discard SNPs with allele count lower than 2 in any +#' non-singleton population (this option is experimental and is based on the +#' hypothesis that using SNPs with allele count lower than 2 only leads to +#' biases in non-singleton populations). Note that, While the `minac2` option +#' discards +#' SNPs with allele count lower than 2 in any population, the \code{qp3pop} +#' function will only discard SNPs with allele count lower than 2 in the +#' first (target) population (when the first argument is the prefix of +#' a genotype file; i.e. it is applied directly to a genotype file, not via +#' precomputing f2 from a [`gen_tibble`]). +#' @param outpop Keep only SNPs which are heterozygous in this population +#' @param outpop_scale Scale f2-statistics by the inverse `outpop` +#' heteroygosity (`1/(p*(1-p))`). Providing `outpop` and setting +#' `outpop_scale` to `TRUE` will give the same results as the original +#' *qpGraph* when the `outpop` parameter has been set, but it has the +#' disadvantage of treating one population different from the others. This +#' may limit the use of these f2-statistics for other models. +#' @param transitions Set this to `FALSE` to exclude transition SNPs +#' @param transversions Set this to `FALSE` to exclude transversion SNPs +#' @param overwrite Overwrite existing files in `outdir` +#' @param adjust_pseudohaploid Genotypes of pseudohaploid samples are +#' usually coded as `0` or `2`, even though only one allele is observed. +#' `adjust_pseudohaploid` ensures that the observed allele count increases +#' only by `1` for each pseudohaploid sample. If `TRUE` (default), samples +#' that don't have any genotypes coded as `1` among the first 1000 SNPs are +#' automatically identified as pseudohaploid. This leads to slightly more +#' accurate estimates of f-statistics. Setting this parameter to `FALSE` +#' treats all samples as diploid and is equivalent to the *ADMIXTOOLS* ` +#' inbreed: NO` option. Setting `adjust_pseudohaploid` to an integer `n` +#' will check the first `n` SNPs instead of the first 1000 SNPs. +#' @param fst Write files with pairwise FST for every population pair. +#' Setting this to FALSE can make `extract_f2` faster and will require less memory. +#' @param afprod Write files with allele frequency products for every +#' population pair. Setting this to FALSE can make `extract_f2` faster and +#' will require less memory. +#' @param poly_only Specify whether SNPs with identical allele frequencies in +#' every population should be discarded (`poly_only = TRUE`), or whether they +#' should be used (`poly_only = FALSE`). By default (`poly_only = c("f2")`), +#' these SNPs will be used to compute FST and allele frequency products, but +#' not to compute f2 (this is the default option in the original ADMIXTOOLS). +#' @param apply_corr Apply small-sample-size correction when computing +#' f2-statistics (default `TRUE`) +#' @param n_cores Parallelize computation across `n_cores` cores. +#' @param quiet Suppress printing of progress updates +#' @return SNP metadata (invisibly) +#' @export + +gt_extract_f2<- function(.x , outdir=NULL, blgsize = 0.05, maxmem = 8000, + maxmiss = 0, minmaf = 0, maxmaf = 0.5, minac2 = FALSE, outpop = NULL, outpop_scale = TRUE, + transitions = TRUE, transversions = TRUE, + overwrite = FALSE, + adjust_pseudohaploid = TRUE, fst = TRUE, afprod = TRUE, + poly_only = c('f2'), apply_corr = TRUE, n_cores = 1, quiet = FALSE) { + if (!requireNamespace("admixtools", quietly = TRUE)) { + stop( + "to use this function, first install package 'admixtools' with\n", + "devtools::install_github('uqrmaie1/admixtools')") + } + + # parameters that don't make sense with a gen_tibble + inds = NULL + pops = NULL + pops2 = NULL + auto_only = FALSE + keepsnps = NULL + + if (!inherits(.x,"grouped_df")){ + stop (".x should be a grouped df") + } + # if no outdir is given, create a subdirectory f2 in the path of the gen_tibble rds + if (is.null(outdir)){ + outdir <- file.path(dirname(.gt_get_bigsnp(.x)$genotypes$rds),"f2") + } + + verbose <- !quiet + afdat = gt_to_aftable(.x, + adjust_pseudohaploid = adjust_pseudohaploid, + n_cores = n_cores) + + if(is.null(inds)) pops = union(pops, pops2) + + afdat %<>% discard_from_aftable(maxmiss = maxmiss, minmaf = minmaf, maxmaf = maxmaf, minac2 = minac2, outpop = outpop, + transitions = transitions, transversions = transversions, + keepsnps = keepsnps, auto_only = auto_only, poly_only = FALSE) + afdat$snpfile %<>% mutate(poly = as.logical(cpp_is_polymorphic(afdat$afs))) + if(sum(afdat$snpfile$poly) == 0) stop('There are no informative SNPs!') + + if(verbose) message(paste0(nrow(afdat$afs), ' SNPs remain after filtering. ', + sum(afdat$snpfile$poly),' are polymorphic.\n')) + + if(isTRUE(poly_only)) poly_only = c('f2', 'ap', 'fst') + arrs = afs_to_f2_blocks(afdat, outdir = outdir, overwrite = overwrite, maxmem = maxmem, poly_only = poly_only, + pops1 = pops, pops2 = pops2, outpop = if(outpop_scale) outpop else NULL, + blgsize = blgsize, afprod = afprod, fst = fst, apply_corr = apply_corr, + n_cores = n_cores, verbose = verbose) + + if(is.null(outdir)) return(arrs) + + if(verbose) message(paste0('Data written to ', outdir, '/\n')) + invisible(afdat$snpfile) +} + + +#' Create an allele frequency table (aftable) for admixtools +#' +#' This function is equivalent to `anygeno_to_aftable()`, but the aftable is +#' created directly from the `gen_tibble`. +#' @param .x the [`gen_tibble`] +#' @param adjust_pseudohaploid Genotypes of pseudohaploid samples are +#' usually coded as `0` or `2`, even though only one allele is observed. +#' `adjust_pseudohaploid` ensures that the observed allele count increases +#' only by `1` for each pseudohaploid sample. If `TRUE` (default), samples +#' that don't have any genotypes coded as `1` among the first 1000 SNPs are +#' automatically identified as pseudohaploid. This leads to slightly more +#' accurate estimates of f-statistics. Setting this parameter to `FALSE` +#' treats all samples as diploid and is equivalent to the *ADMIXTOOLS* ` +#' inbreed: NO` option. Setting `adjust_pseudohaploid` to an integer `n` +#' will check the first `n` SNPs instead of the first 1000 SNPs. +#' @param n_cores Parallelize computation across `n_cores` cores. +#' @returns a list of 3 elements: afs, counts, and snp +#' @keywords internal + +gt_to_aftable <- function(.x, adjust_pseudohaploid= TRUE, n_cores = bigstatsr::nb_cores()){ + if (!inherits(.x,"grouped_df")){ + stop (".x should be a grouped df") + } + geno_fbm <- .gt_get_bigsnp(.x)$genotypes + + # TODO we need a version with mixed ploidy that allows to account for haploids (or pseudohaploids) + if (adjust_pseudohaploid){ + if (is.numeric(adjust_pseudohaploid)){ + n_test <- adjust_pseudohaploid + } else { + n_test <- 1000 + } + ploidy <- identify_pseudohaploids(.x, n_test) + aftable <- gt_grouped_alt_freq_pseudohap(BM = geno_fbm,rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = dplyr::group_indices(.x)-1, + ngroups = max(dplyr::group_indices(.x)), + ploidy = ploidy, + ncores = n_cores) + + } else { + aftable <- gt_grouped_alt_freq_diploid(BM = geno_fbm,rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = dplyr::group_indices(.x)-1, + ngroups = max(dplyr::group_indices(.x)), + ncores = n_cores) + + } + names(aftable)<-c("afs","counts") + + .group_levels = .x %>% group_keys() %>% pull(1) + dimnames(aftable$afs)<-list(loci_names(.x),.group_levels) + dimnames(aftable$counts)<-list(loci_names(.x),.group_levels) + + loci_new_names <- c(SNP = "name", CHR = "chromosome", POS="position",cm="genetic_dist", + A1 = "allele_alt",A2 = "allele_ref") + snp <- show_loci(.x) %>% select(-all_of("big_index")) %>% + rename(dplyr::all_of(loci_new_names)) %>% + dplyr::relocate(all_of(c("CHR", "SNP", "cm", "POS", "A1", "A2"))) + snp$A1[is.na(snp$A1)]<-"0" + class(snp)<-c("spec_tbl_df",class(snp)) + # we are missing column specs, which are generated by readr::col_spec + # do we need to make sure that CHR and SNP are character? + snp$CHR <- as.character(snp$CHR) + snp$SNP <- as.character(snp$SNP) + + aftable$snpfile <- snp + return(aftable) +} + + + + +# import some functions that are not exported by admixtools +discard_from_aftable <- function(...){ + utils::getFromNamespace("discard_from_aftable", "admixtools")(...)} + +afs_to_f2_blocks <- function(...){ + utils::getFromNamespace("afs_to_f2_blocks", "admixtools")(...)} + +cpp_is_polymorphic <- function(...) { + utils::getFromNamespace("cpp_is_polymorphic", "admixtools")(...)} + +#' Identify pseudohaploids +#' +#' Pseudohaploids are coded as all homozygotes; we find them by checking the +#' first 'n_test' loci and call a pseudohaploid if they have zero heterozygosity +#' (this is the same strategy employed in admixtools) +#' @param x the gen_tibble +#' @param n_test the number of loci being tested +#' @return a numeric vector of ploidy +#' @keywords internal +identify_pseudohaploids <- function (x, n_test = 1000){ + if (n_test>count_loci(x)){ + n_test <- count_loci(x) + } + sub_x <- select_loci(x,.sel_arg = dplyr::all_of(c(1:n_test))) %>% + dplyr::ungroup() + het_obs <- indiv_het_obs(sub_x) + ploidy <-rep(2,nrow(x)) + ploidy[het_obs==0]<-1 + return(ploidy) +} diff --git a/R/loci_alt_freq.R b/R/loci_alt_freq.R index ac40cbaf..a4028cbb 100644 --- a/R/loci_alt_freq.R +++ b/R/loci_alt_freq.R @@ -9,6 +9,7 @@ #' @param .x a vector of class `vctrs_bigSNP` (usually the `genotypes` column of #' a [`gen_tibble`] object), #' or a [`gen_tibble`]. +#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()] #' @param ... other arguments passed to specific methods, currently unused. #' @returns a vector of frequencies, one per locus #' @rdname loci_alt_freq @@ -22,7 +23,7 @@ loci_alt_freq <- function(.x, ...) { loci_alt_freq.tbl_df <- function(.x, ...) { #TODO this is a hack to deal with the class being dropped when going through group_map stopifnot_gen_tibble(.x) - loci_alt_freq(.x$genotypes, ...) + loci_alt_freq(.x$genotypes) } @@ -32,7 +33,7 @@ loci_alt_freq.vctrs_bigSNP <- function(.x, ...) { rlang::check_dots_empty() #stopifnot_diploid(.x) # if we have diploid - if (attr(.x,"ploidy")==2){ + if (is_diploid_only(.x)){ loci_alt_freq_diploid(.x) } else { loci_alt_freq_polyploid(.x) @@ -41,10 +42,24 @@ loci_alt_freq.vctrs_bigSNP <- function(.x, ...) { #' @export #' @rdname loci_alt_freq -loci_alt_freq.grouped_df <- function(.x, ...) { - # TODO this is seriously inefficient, we need to cast it into a big_apply problem - # of maybe it isn't that bad... - group_map(.x, .f=~loci_alt_freq(.x,, ...)) +loci_alt_freq.grouped_df <- function(.x, n_cores = bigstatsr::nb_cores(), ...) { + rlang::check_dots_empty() + if (is_diploid_only(.x)){ + geno_fbm <- .gt_get_bigsnp(.x)$genotypes + + freq_mat <- gt_grouped_alt_freq_diploid(BM = geno_fbm,rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = dplyr::group_indices(.x)-1, + ngroups = max(dplyr::group_indices(.x)), + ncores = n_cores)$freq_alt + # return a list to mimic a group_map + lapply(seq_len(ncol(freq_mat)), function(i) freq_mat[,i]) + } else { + # TODO this is seriously inefficient, we should replace it with a cpp function + group_map(.x, .f=~loci_alt_freq(.x,, ...)) + } + + } #' @rdname loci_alt_freq @@ -71,10 +86,23 @@ loci_maf.vctrs_bigSNP <- function(.x, ...) { #' @export #' @rdname loci_alt_freq -loci_maf.grouped_df <- function(.x, ...) { - # TODO this is seriously inefficient, we need to cast it into a big_apply problem - # of maybe it isn't that bad... - group_map(.x, .f=~loci_maf(.x, ...)) +loci_maf.grouped_df <- function(.x, n_cores = bigstatsr::nb_cores(), ...) { + rlang::check_dots_empty() + if (is_diploid_only(.x)){ + geno_fbm <- .gt_get_bigsnp(.x)$genotypes + + freq_mat <- gt_grouped_alt_freq_diploid(BM = geno_fbm,rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = dplyr::group_indices(.x)-1, + ngroups = max(dplyr::group_indices(.x)), + ncores = n_cores)$freq_alt + freq_mat[freq_mat>0.5 & !is.na(freq_mat)] <- 1 - freq_mat[freq_mat>0.5 & !is.na(freq_mat)] + # return a list to mimic a group_map + lapply(seq_len(ncol(freq_mat)), function(i) freq_mat[,i]) + } else { + # TODO this is seriously inefficient, we should replace it with a cpp function + group_map(.x, .f=~loci_maf(.x,, ...)) + } } # function to estimate frequencies for diploid @@ -95,9 +123,6 @@ loci_alt_freq_diploid <- function(.x){ } else { # if we have a single individual freq <-geno_fbm[rows_to_keep,attr(.x,"loci")$big_index] /2 } - # sort out frequencies for diploids - # @TODO get ploidy from the tibble once we have it - # @TODO for mixed ploidy, we will need a different approach freq } diff --git a/R/loci_missingness.R b/R/loci_missingness.R index e3538159..4c154a79 100644 --- a/R/loci_missingness.R +++ b/R/loci_missingness.R @@ -7,6 +7,7 @@ #' or a [`gen_tibble`]. #' @param as_counts boolean defining whether the count of NAs (rather than the rate) #' should be returned. It defaults to FALSE (i.e. rates are returned by default). +#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()] #' @param ... other arguments passed to specific methods. #' @returns a vector of frequencies, one per locus #' @rdname loci_missingness @@ -35,19 +36,9 @@ loci_missingness.vctrs_bigSNP <- function(.x, as_counts = FALSE, ...) { rows_to_keep <- vctrs::vec_data(.x) # as long as we have more than one individual if (length(rows_to_keep)>1){ - # # col means for submatrix (all rows, only some columns) - # colMeans_sub <- function(X, ind, rows_to_keep) { - # count_na <- function(x){sum(is.na(x))} - # apply(X[rows_to_keep, ind], 2, count_na) - # } - # n_na <- bigstatsr::big_apply(geno_fbm, a.FUN = colMeans_sub, - # rows_to_keep = rows_to_keep, - # ind=attr(.x,"loci")$big_index, - # a.combine = 'c') - n_na <- bigstatsr::big_counts(geno_fbm, ind.row = rows_to_keep, ind.col = attr(.x,"loci")$big_index) - n_na <- n_na[nrow(n_na),] + n_na <- n_na[nrow(n_na),] # this should work also with polyploids if (!as_counts){ n_na <- n_na/length(rows_to_keep) } @@ -59,9 +50,22 @@ loci_missingness.vctrs_bigSNP <- function(.x, as_counts = FALSE, ...) { #' @export #' @rdname loci_missingness -loci_missingness.grouped_df <- function(.x, as_counts = FALSE, ...) { - # TODO this is seriously inefficient, we need to cast it into a big_apply problem - # of maybe it isn't that bad... - group_map(.x, .f=~loci_missingness(.x, as_counts=as_counts)) +loci_missingness.grouped_df <- function(.x, as_counts = FALSE, + n_cores = bigstatsr::nb_cores(), ...) { + rlang::check_dots_empty() + geno_fbm <- .gt_get_bigsnp(.x)$genotypes + + na_mat <- gt_grouped_missingness(BM = geno_fbm,rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = dplyr::group_indices(.x)-1, + ngroups = max(dplyr::group_indices(.x)), + ncores = n_cores) + group_sizes <- tally(.x) %>% dplyr::pull(dplyr::all_of("n")) + if (!as_counts){ + na_mat <- sweep(na_mat, MARGIN=2, STATS=group_sizes,FUN="/") + } + + # return a list to mimic a group_map + lapply(seq_len(ncol(na_mat)), function(i) na_mat[,i]) } diff --git a/R/pairwise_pop_fst.R b/R/pairwise_pop_fst.R index 46f7d14b..62838ef9 100644 --- a/R/pairwise_pop_fst.R +++ b/R/pairwise_pop_fst.R @@ -20,20 +20,22 @@ #' or as a single genome wide value obtained by taking the ratio of the mean numerator #' and denominator (FALSE, the default). #' @param method one of 'Hudson', 'Nei86', 'Nei87', and 'WC84' +#' @param n_cores number of cores to be used, it defaults to [bigstatsr::nb_cores()] #' @returns a tibble of genome-wide pairwise Fst values with each pairwise combination #' as a row if "by_locus=FALSE", else #' a list including the tibble of genome-wide values as well as a matrix with pairwise #' Fst by locus with loci as rows and and pairwise #' combinations as columns. -#' a matrix #' @export # #' @param tidy boolean whether to return a tidy tibble. Default is TRUE, FALSE -# #' returns a matrix. THIS IS NOT IMPLEMENT YET. +# #' returns a matrix. THIS IS NOT IMPLEMENTED YET. -pairwise_pop_fst <- function(.x, by_locus=FALSE, method= c("Hudson","Nei87","Nei86","WC84")){ +pairwise_pop_fst <- function(.x, by_locus=FALSE, + method= c("Hudson","Nei87","Nei86","WC84"), + n_cores = bigstatsr::nb_cores()){ if (!inherits(.x,"grouped_df")){ stop (".x should be a grouped df") } @@ -49,7 +51,7 @@ pairwise_pop_fst <- function(.x, by_locus=FALSE, method= c("Hudson","Nei87","Nei } } -pairwise_pop_fst_hudson <- function(.x, by_locus=FALSE){ +pairwise_pop_fst_hudson <- function(.x, by_locus=FALSE, n_cores = bigstatsr::nb_cores()){ message("this function is not properly tested yet!!!") message("if you have time, test the results against something like hierfstat and create a unit test") @@ -63,16 +65,22 @@ pairwise_pop_fst_hudson <- function(.x, by_locus=FALSE){ Fst_locus <- matrix(NA_real_, nrow = count_loci(.x), ncol = ncol(pairwise_combn)) } # summarise population frequencies - pop_freqs_df <- group_map(.x, .f=~.gt_pop_freqs(.x)) + pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, + rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = dplyr::group_indices(.x)-1, + ngroups = nrow(.group_levels), + ncores = n_cores) + for (i_col in seq_len(ncol(pairwise_combn))){ pop1 <- pairwise_combn[1,i_col] pop2 <- pairwise_combn[2,i_col] - numerator <- (pop_freqs_df[[pop1]]$freq_alt-pop_freqs_df[[pop2]]$freq_alt)^2 - - (pop_freqs_df[[pop1]]$freq_alt*pop_freqs_df[[pop1]]$freq_ref)/ (pop_freqs_df[[pop1]]$n -1) - - (pop_freqs_df[[pop2]]$freq_alt*pop_freqs_df[[pop2]]$freq_ref)/ (pop_freqs_df[[pop2]]$n -1) - denominator <- pop_freqs_df[[pop1]]$freq_alt * pop_freqs_df[[pop2]]$freq_ref + - pop_freqs_df[[pop2]]$freq_alt * pop_freqs_df[[pop1]]$freq_ref + numerator <- (pop_freqs_df$freq_alt[,pop1]-pop_freqs_df$freq_alt[,pop2])^2 - + (pop_freqs_df$freq_alt[,pop1]*pop_freqs_df$freq_ref[,pop1])/ (pop_freqs_df$n[,pop1] -1) - + (pop_freqs_df$freq_alt[,pop2]*pop_freqs_df$freq_ref[,pop2])/ (pop_freqs_df$n[,pop2] -1) + denominator <- pop_freqs_df$freq_alt[,pop1] * pop_freqs_df$freq_ref[,pop2] + + pop_freqs_df$freq_alt[,pop2] * pop_freqs_df$freq_ref[,pop1] if (by_locus){ Fst_locus[,i_col] = numerator/denominator } @@ -96,8 +104,11 @@ pairwise_pop_fst_hudson <- function(.x, by_locus=FALSE){ -# the implementation for Nei 87, adapted from hierfstat -pairwise_pop_fst_nei87 <- function(.x, by_locus = FALSE){ +# based on the formula in Bhatia 2013 +pairwise_pop_fst_wc84 <- function(.x, by_locus=FALSE, n_cores = bigstatsr::nb_cores()){ + + message("this function is not properly tested yet!!!") + message("if you have time, test the results against something like hierfstat and create a unit test") # get the populations .group_levels = .x %>% group_keys() # create all combinations @@ -108,39 +119,30 @@ pairwise_pop_fst_nei87 <- function(.x, by_locus = FALSE){ Fst_locus <- matrix(NA_real_, nrow = count_loci(.x), ncol = ncol(pairwise_combn)) } # summarise population frequencies - pop_freqs_df <- group_map(.x, .f=~.gt_pop_freqs(.x)) + pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, + rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = dplyr::group_indices(.x)-1, + ngroups = nrow(.group_levels), + ncores = n_cores) for (i_col in seq_len(ncol(pairwise_combn))){ pop1 <- pairwise_combn[1,i_col] pop2 <- pairwise_combn[2,i_col] - - n <-cbind(pop_freqs_df[[pop1]]$n,pop_freqs_df[[pop2]]$n)/2 - sHo <-cbind(pop_freqs_df[[pop1]]$het_obs,pop_freqs_df[[pop2]]$het_obs) - mHo <- apply(sHo, 1, mean, na.rm = TRUE) - freq_alt <- cbind(pop_freqs_df[[pop1]]$freq_alt, pop_freqs_df[[pop2]]$freq_alt) - freq_ref <- cbind(pop_freqs_df[[pop1]]$freq_ref, pop_freqs_df[[pop2]]$freq_ref) - - # sum of squared frequencies - sp2 <- freq_alt^2+freq_ref^2 - Hs <- (1 - sp2 - sHo/2/n) - Hs <- n/(n - 1) * Hs - np <- apply(n, 1, fun <- function(x) sum(!is.na(x))) - # mean sample size over the populations - mn <- apply(n, 1, fun <- function(x) { - sum(!is.na(x))/sum(1/x[!is.na(x)]) - }) - # mean sum of square frequencies - msp2 <- apply(sp2, 1, mean, na.rm = TRUE) - mp2 <- rowMeans(freq_alt)^2+rowMeans(freq_ref)^2 - mHs <- mn/(mn - 1) * (1 - msp2 - mHo/2/mn) - Ht <- 1 - mp2 + mHs/mn/np - mHo/2/mn/np - - Dst <- Ht - mHs - Dstp <- np/(np - 1) * Dst - Htp = mHs + Dstp + n1 <- pop_freqs_df$n[,pop1] + n2 <- pop_freqs_df$n[,pop2] + p1 <- pop_freqs_df$freq_alt[,pop1] + p2 <- pop_freqs_df$freq_alt[,pop2] + q1 <- pop_freqs_df$freq_ref[,pop1] + q2 <- pop_freqs_df$freq_ref[,pop2] + a <- (n1*n2)/(n1+n2) + b <- (1/(n1+n2-2))*((n1*p1*q1)+(n2*p2*q2)) + c <- (p1-p2)^2 + numerator <- 2*a*b + denominator <- a*c+(2*a-1)*b if (by_locus){ - Fst_locus[,i_col] = Dstp/Htp + Fst_locus[,i_col] = 1-numerator/denominator } - Fst_tot[i_col]<-mean(Dstp)/mean(Htp) + Fst_tot[i_col]<-1-mean(numerator)/mean(denominator) } # format nicely the objects group_combinations <- cbind(.group_levels[pairwise_combn[1,],],.group_levels[pairwise_combn[2,],]) @@ -157,9 +159,8 @@ pairwise_pop_fst_nei87 <- function(.x, by_locus = FALSE){ } } - # based on the formula in Bhatia 2013 -pairwise_pop_fst_wc84 <- function(.x, by_locus=FALSE){ +pairwise_pop_fst_nei86 <- function(.x, by_locus=FALSE, n_cores = bigstatsr::nb_cores()){ message("this function is not properly tested yet!!!") message("if you have time, test the results against something like hierfstat and create a unit test") @@ -173,25 +174,24 @@ pairwise_pop_fst_wc84 <- function(.x, by_locus=FALSE){ Fst_locus <- matrix(NA_real_, nrow = count_loci(.x), ncol = ncol(pairwise_combn)) } # summarise population frequencies - pop_freqs_df <- group_map(.x, .f=~.gt_pop_freqs(.x)) + pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, + rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = dplyr::group_indices(.x)-1, + ngroups = nrow(.group_levels), + ncores = n_cores) for (i_col in seq_len(ncol(pairwise_combn))){ pop1 <- pairwise_combn[1,i_col] pop2 <- pairwise_combn[2,i_col] - n1 <- pop_freqs_df[[pop1]]$n - n2 <- pop_freqs_df[[pop2]]$n - p1 <- pop_freqs_df[[pop1]]$freq_alt - p2 <- pop_freqs_df[[pop2]]$freq_alt - q1 <- pop_freqs_df[[pop1]]$freq_ref - q2 <- pop_freqs_df[[pop2]]$freq_ref - a <- (n1*n2)/(n1+n2) - b <- (1/(n1+n2-2))*((n1*p1*q1)+(n2*p2*q2)) - c <- (p1-p2)^2 - numerator <- 2*a*b - denominator <- a*c+(2*a-1)*b + p1 <- pop_freqs_df$freq_alt[,pop1] + p2 <- pop_freqs_df$freq_alt[,pop2] + pbar <- (p1+p2)/2 + numerator <- ((p1-p2)^2) + denominator <- (2*pbar*(1-pbar)) if (by_locus){ - Fst_locus[,i_col] = 1-numerator/denominator + Fst_locus[,i_col] = numerator/denominator } - Fst_tot[i_col]<-1-mean(numerator)/mean(denominator) + Fst_tot[i_col]<-mean(numerator)/mean(denominator) } # format nicely the objects group_combinations <- cbind(.group_levels[pairwise_combn[1,],],.group_levels[pairwise_combn[2,],]) @@ -208,11 +208,13 @@ pairwise_pop_fst_wc84 <- function(.x, by_locus=FALSE){ } } -# based on the formula in Bhatia 2013 -pairwise_pop_fst_nei86 <- function(.x, by_locus=FALSE){ - message("this function is not properly tested yet!!!") - message("if you have time, test the results against something like hierfstat and create a unit test") + +## use tidyr::pivot_wider to turn into a matrix if that's what is requested. + +### Faster versions +# the implementation for Nei 87, adapted from hierfstat +pairwise_pop_fst_nei87 <- function(.x, by_locus = FALSE, n_cores = bigstatsr::nb_cores()){ # get the populations .group_levels = .x %>% group_keys() # create all combinations @@ -223,19 +225,44 @@ pairwise_pop_fst_nei86 <- function(.x, by_locus=FALSE){ Fst_locus <- matrix(NA_real_, nrow = count_loci(.x), ncol = ncol(pairwise_combn)) } # summarise population frequencies - pop_freqs_df <- group_map(.x, .f=~.gt_pop_freqs(.x)) + pop_freqs_df <- gt_grouped_summaries(.gt_get_bigsnp(.x)$genotypes, + rowInd = .gt_bigsnp_rows(.x), + colInd = .gt_bigsnp_cols(.x), + groupIds = dplyr::group_indices(.x)-1, + ngroups = nrow(.group_levels), + ncores = n_cores) for (i_col in seq_len(ncol(pairwise_combn))){ pop1 <- pairwise_combn[1,i_col] pop2 <- pairwise_combn[2,i_col] - p1 <- pop_freqs_df[[pop1]]$freq_alt - p2 <- pop_freqs_df[[pop2]]$freq_alt - pbar <- (p1+p2)/2 - numerator <- ((p1-p2)^2) - denominator <- (2*pbar*(1-pbar)) + + n <-pop_freqs_df$n[,c(pop1,pop2)]/2 + sHo <-pop_freqs_df$het_obs[,c(pop1,pop2)] + mHo <- apply(sHo, 1, mean, na.rm = TRUE) + freq_alt <- pop_freqs_df$freq_alt[,c(pop1,pop2)] + freq_ref <- pop_freqs_df$freq_ref[,c(pop1,pop2)] + + # sum of squared frequencies + sp2 <- freq_alt^2+freq_ref^2 + Hs <- (1 - sp2 - sHo/2/n) + Hs <- n/(n - 1) * Hs + np <- apply(n, 1, fun <- function(x) sum(!is.na(x))) + # mean sample size over the populations + mn <- apply(n, 1, fun <- function(x) { + sum(!is.na(x))/sum(1/x[!is.na(x)]) + }) + # mean sum of square frequencies + msp2 <- apply(sp2, 1, mean, na.rm = TRUE) + mp2 <- rowMeans(freq_alt)^2+rowMeans(freq_ref)^2 + mHs <- mn/(mn - 1) * (1 - msp2 - mHo/2/mn) + Ht <- 1 - mp2 + mHs/mn/np - mHo/2/mn/np + + Dst <- Ht - mHs + Dstp <- np/(np - 1) * Dst + Htp = mHs + Dstp if (by_locus){ - Fst_locus[,i_col] = numerator/denominator + Fst_locus[,i_col] = Dstp/Htp } - Fst_tot[i_col]<-mean(numerator)/mean(denominator) + Fst_tot[i_col]<-mean(Dstp)/mean(Htp) } # format nicely the objects group_combinations <- cbind(.group_levels[pairwise_combn[1,],],.group_levels[pairwise_combn[2,],]) @@ -254,8 +281,4 @@ pairwise_pop_fst_nei86 <- function(.x, by_locus=FALSE){ -## use tidyr::pivot_wider to turn into a matrix if that's what is requested. - - - diff --git a/R/show_genotypes.R b/R/show_genotypes.R index 31419fac..5501caaf 100644 --- a/R/show_genotypes.R +++ b/R/show_genotypes.R @@ -11,7 +11,7 @@ #' extract information on the alleles for those loci from a [`gen_tibble`]. #' @rdname show_genotypes #' @export -show_genotypes <- function(.x, ...) { +show_genotypes <- function(.x, indiv_indices=NULL, loci_indices=NULL, ...) { UseMethod("show_genotypes", .x) } diff --git a/R/show_loci.R b/R/show_loci.R index a013a9b2..0e73341c 100644 --- a/R/show_loci.R +++ b/R/show_loci.R @@ -18,7 +18,7 @@ show_loci <- function(.x, ...) { show_loci.tbl_df <- function(.x, ...){ stopifnot_gen_tibble(.x) # extract the column and hand it over to its method - show_loci(.x$genotypes) + show_loci(.x$genotypes, ...) } diff --git a/R/utils-pipe.R b/R/utils-pipe.R index fd0b1d13..c1a45e24 100644 --- a/R/utils-pipe.R +++ b/R/utils-pipe.R @@ -7,6 +7,7 @@ #' @keywords internal #' @export #' @importFrom magrittr %>% +#' @importFrom magrittr %<>% #' @usage lhs \%>\% rhs #' @param lhs A value or the magrittr placeholder. #' @param rhs A function call using the magrittr semantics. diff --git a/R/utils.R b/R/utils.R index c4f90b14..ed5e5f70 100644 --- a/R/utils.R +++ b/R/utils.R @@ -29,3 +29,10 @@ stopifnot_diploid <- function(x){ } } +is_diploid_only <- function(x){ + if (inherits(x,"gen_tbl")){ + (attr(x$genotypes,"ploidy")==2) + } else { + (attr(x,"ploidy")==2) + } +} diff --git a/R/vcf_to_fbm.R b/R/vcf_to_fbm.R index 239001d4..277d6866 100644 --- a/R/vcf_to_fbm.R +++ b/R/vcf_to_fbm.R @@ -12,7 +12,7 @@ vcf_to_fbm <- function( vcf_path, - loci_per_chunk = 1000, + loci_per_chunk = 100000, backingfile = NULL, quiet=FALSE) { @@ -37,15 +37,18 @@ vcf_to_fbm <- function( ) chunks_vec_index <- c(1, chunks_vec) - # figure out ploidy from the first 1000 markers + # figure out ploidy from the first marker temp_vcf <- vcfR::read.vcfR( vcf_path, - nrow = min(1000,no_variants), + nrow = 1, verbose = !quiet ) - temp_gt <- vcfR::extract.gt(temp_vcf) + temp_gt <- vcfR::extract.gt(temp_vcf,convertNA = FALSE) ploidy <- apply(temp_gt,2,get_ploidy) - max_ploidy <- max(ploidy, na.rm = TRUE) # remove NA in case we failed to figure out the ploidy of an individual + if (any(is.na(ploidy))){ + stop("error whilst determining ploidy") + } + max_ploidy <- max(ploidy) # set up codes for the appropriate ploidy level code256 <- rep(NA_real_, 256) @@ -142,7 +145,7 @@ poly_indiv_dosage <- function (x, max_ploidy){ sapply(strsplit(x,"[/|]"),poly_genotype_dosage , max_ploidy) } -# get dosages for a genotype x (as a vector of 0 and 1) +# get dosages for a genotype x and return them as raw for inclusion in the filebacked matrix poly_genotype_dosage <- function (x, max_ploidy){ if (!is.na(x[1]) && x[1]!="."){ as.raw(sum(as.numeric(x))) diff --git a/data-raw/admixtools_temp.R b/data-raw/admixtools_temp.R new file mode 100644 index 00000000..b22c6360 --- /dev/null +++ b/data-raw/admixtools_temp.R @@ -0,0 +1,139 @@ +########################################################################################### +# everything works up to here +# the function below needs to be modified to use gt_extract_afs + + +gt_extract_f2_large = function(x, outdir, blgsize = 0.05, cols_per_chunk = 10, + maxmiss = 0, minmaf = 0, maxmaf = 0.5, minac2 = FALSE, outpop = NULL, outpop_scale = TRUE, + transitions = TRUE, transversions = TRUE, + keepsnps = NULL, snpblocks = NULL, overwrite = FALSE, format = NULL, + adjust_pseudohaploid = TRUE, afprod = TRUE, fst = TRUE, poly_only = c('f2'), + apply_corr = TRUE, verbose = TRUE) { + inds = NULL + pops = NULL + if(!quiet) message(paste0('Extracting allele frequencies...\n')) + snpdat = extract_afs(pref, outdir, inds = inds, pops = pops, cols_per_chunk = cols_per_chunk, numparts = 100, + maxmiss = maxmiss, minmaf = minmaf, maxmaf = maxmaf, minac2 = minac2, outpop = outpop, + transitions = transitions, transversions = transversions, + keepsnps = keepsnps, format = format, poly_only = FALSE, + adjust_pseudohaploid = adjust_pseudohaploid, verbose = verbose) + + numchunks = length(list.files(outdir, 'afs.+rds')) + + if(!is.null(outpop) && outpop_scale) { + p = snpdat$outpopaf + snpwt = 1/(p*(1-p)) + } else snpwt = NULL + if(isTRUE(poly_only)) poly_only = c('f2', 'ap', 'fst') + + if(verbose) alert_warning(paste0('Computing ', choose(numchunks+1, 2), ' chunk pairs. If this takes too long, + consider running "extract_afs" and then paralellizing over "afs_to_f2".\n')) + for(i in 1:numchunks) { + for(j in i:numchunks) { + if(verbose) alert_info(paste0('Writing pair ', i, ' - ', j, '...\r')) + afs_to_f2(outdir, outdir, chunk1 = i, chunk2 = j, blgsize = blgsize, snpdat = snpdat, + snpwt = snpwt, overwrite = overwrite, type = 'f2', poly_only = 'f2' %in% poly_only, + apply_corr = apply_corr) + if(afprod) afs_to_f2(outdir, outdir, chunk1 = i, chunk2 = j, blgsize = blgsize, snpdat = snpdat, + snpwt = snpwt, overwrite = overwrite, type = 'ap', poly_only = 'ap' %in% poly_only) + if(fst) afs_to_f2(outdir, outdir, chunk1 = i, chunk2 = j, blgsize = blgsize, snpdat = snpdat, + snpwt = snpwt, overwrite = overwrite, type = 'fst', poly_only = 'fst' %in% poly_only) + } + } + if(verbose) alert_info('\n') + if(verbose) alert_info(paste0('Deleting allele frequency files...\n')) + unlink(paste0(outdir, c('/afs*.rds', '/counts*.rds'))) +} + + +## threre is something off with this function, ti does not seem to replicate extract_afs +gt_extract_afs <- function (.x, outdir = "./afs", cols_per_chunk = 10, + numparts = 100, maxmiss = 0, minmaf = 0, maxmaf = 0.5, minac2 = FALSE, + outpop = NULL, transitions = TRUE, transversions = TRUE, + format = NULL, poly_only = FALSE, adjust_pseudohaploid = TRUE, + quiet = FALSE){ + # variables that don't make sense with gen_tibble + inds = NULL + pops = NULL + auto_only = FALSE + keepsnps = NULL + + warning ("this function is untested") + if (!inherits(.x,"grouped_df")){ + stop (".x should be a grouped df") + } + if (!dir.exists(outdir)){ + dir.create(outdir) + } + verbose <- !quiet + afdat = gt_to_aftable(.x)#, inds = inds, pops = pops, + # format = format, adjust_pseudohaploid = adjust_pseudohaploid, + # verbose = verbose) + afdat %<>% discard_from_aftable(maxmiss = maxmiss, minmaf = minmaf, + maxmaf = maxmaf, minac2 = minac2, outpop = outpop, transitions = transitions, + transversions = transversions, keepsnps = keepsnps, auto_only = auto_only, + poly_only = poly_only) + afdat$snpfile <- afdat$snpfile %>% mutate(poly = as.logical(cpp_is_polymorphic(afdat$afs))) + if (!quiet) + message(paste0(nrow(afdat$afs), " SNPs remain after filtering. ", + sum(afdat$snpfile$poly), " are polymorphic.\n")) + admixtools::split_mat(afdat$afs, cols_per_chunk = cols_per_chunk, prefix = paste0(outdir, + "/afs"), verbose = verbose) + admixtools::split_mat(afdat$counts, cols_per_chunk = cols_per_chunk, + prefix = paste0(outdir, "/counts"), verbose = verbose) + block_lengths = admixtools::get_block_lengths(afdat$snpfile %>% filter(.data$poly), + blgsize = blgsize) + block_lengths_a = admixtools::get_block_lengths(afdat$snpfile, blgsize = blgsize) + saveRDS(block_lengths, file = paste0(outdir, "/block_lengths.rds")) + saveRDS(block_lengths_a, file = paste0(outdir, "/block_lengths_a.rds")) + readr::write_tsv(afdat$snpfile, paste0(outdir, "/snpdat.tsv.gz")) + invisible(afdat$snpfile) +} + +#' Compute and store blocked f2 statistics for a `gen_tibble` +#' +#' This function prepares data for various `ADMIXTOOLS 2` function, and it is +#' equivalent to `admixtools::extract_f2`. An important difference is that +#' the filtering for snps (e.g. by maf) or populations is not performed by this +#' function; it is expected that the `gen_tibble` has been filtered appropriately. +#' +#' @param .x the `gen_tibble`, appropriately filtered for individuals, populations and snps +#' @param outdir the directory where the f2 stats will be stored. If left NULL, a directory +#' names 'f2' will be created in the same path as the RDS fo the `gen_tibble` +#' @param blgsize SNP block size in Morgan. Default is 0.05 (5 cM). If `blgsize` +#' is 100 or greater, if will be interpreted as base pair distance rather than centimorgan distance. +#' @param cols_per_chunk Number of allele frequency chunks to store on disk. +#' Setting this to a positive integer makes the function slower, +#' but requires less memory. The default value for cols_per_chunk +#' in extract_afs is 10. Lower numbers will lower the memory requirement +#' but increase the time it takes. +#' @param quiet boolean on whether the progess should be silenced +#' @returns SNP metadata (invisibly) +#' @keywords internal + +## TO BE TESTED +gt_extract_f2_broken <- function(.x , outdir=NULL, cols_per_chunk = 10, blgsize = 0.05, quiet=FALSE){ + warning ("this function is untested") + if (!inherits(.x,"grouped_df")){ + stop (".x should be a grouped df") + } + # if no outdir is given, create a subdirectory f2 in the path of the gen_tibble rds + if (is.null(outdir)){ + outdir <- file.path(dirname(.gt_get_bigsnp(.x)$genotypes$rds),"f2") + } + # we will place the af tables into a subdirectory + af_dir <- file.path(outdir,"af_tbls") + if (!dir.exists(af_dir)){ + dir.create(af_dir,recursive = TRUE) + } + gt_extract_afs(.x, outdir = af_dir, cols_per_chunk = cols_per_chunk, + blgsize = blgsize, quiet= quiet) + numchunks = length(list.files(af_dir, 'afs.+rds')) + for(i in 1:numchunks) { + for(j in i:numchunks) { + admixtools::afs_to_f2(af_dir, outdir, chunk1 = i, chunk2 = j) + } + } +} + +#gt_extract_f2(test_gt) diff --git a/R/loci_sums.R b/data-raw/loci_sums.R similarity index 100% rename from R/loci_sums.R rename to data-raw/loci_sums.R diff --git a/data-raw/test_group_freq.R b/data-raw/test_group_freq.R new file mode 100644 index 00000000..6bf7abd3 --- /dev/null +++ b/data-raw/test_group_freq.R @@ -0,0 +1,25 @@ +test_genotypes <- rbind(c(1,1,0,1,1,0), + c(2,1,0,NA,0,0), + c(2,NA,0,0,1,1), + c(1,0,0,1,0,0), + c(1,2,0,1,2,1), + c(0,0,0,0,NA,1), + c(0,1,1,0,1,NA)) +test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"), + population = c("pop1","pop1","pop2","pop2","pop1","pop3","pop3")) +test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=paste0("chr",c(1,1,1,1,2,2)), + position=as.integer(c(3,5,65,343,23,456)), + genetic_dist = as.integer(rep(0,6)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) + +test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE) +test_gt <- test_gt %>% dplyr::group_by(population) + +.x<-test_gt +geno_fbm <- .gt_get_bigsnp(.x)$genotypes + +foo<-gt_group_freq(geno_fbm, .gt_bigsnp_rows(.x),.gt_bigsnp_cols(.x),dplyr::group_indices(.x)-1,max(dplyr::group_indices(.x)),1) + +foo2 <- group_map(.x, .f=~.gt_pop_freqs(.x)) diff --git a/man/gen_tibble.Rd b/man/gen_tibble.Rd index 891e1e8a..4b4f5ec7 100644 --- a/man/gen_tibble.Rd +++ b/man/gen_tibble.Rd @@ -40,14 +40,17 @@ gen_tibble( \arguments{ \item{x}{can be: \itemize{ -\item a string giving the path to a PLINK BED or PED file. The correspective -BIM and FAM fiel for the BED, or MAP for PED are expected to be in the same +\item a string giving the path to a PLINK BED or PED file. The associated +BIM and FAM files for the BED, or MAP for PED are expected to be in the same directory and have the same file name. \item a string giving the path to a RDS file storing a \code{bigSNP} object from the \code{bigsnpr} package (usually created with \code{\link[bigsnpr:snp_readBed]{bigsnpr::snp_readBed()}}) \item a string giving the path to a vcf file. Note that we currently read the whole vcf in memory with \code{vcfR}, so only smallish vcf can be imported. Only biallelic SNPs will be considered. +\item a string giving the path to a packedancestry .geno file. The associated +.ind and .snp files are expected to be in the same directory and share the +same file name prefix. \item a genotype matrix of dosages (0, 1, 2, NA) giving the dosage of the alternate allele. }} diff --git a/man/gt_extract_f2.Rd b/man/gt_extract_f2.Rd new file mode 100644 index 00000000..a6ca08a6 --- /dev/null +++ b/man/gt_extract_f2.Rd @@ -0,0 +1,125 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gt_extract_f2.R +\name{gt_extract_f2} +\alias{gt_extract_f2} +\title{Compute and store blocked f2 statistics for ADMIXTOOLS 2} +\usage{ +gt_extract_f2( + .x, + outdir = NULL, + blgsize = 0.05, + maxmem = 8000, + maxmiss = 0, + minmaf = 0, + maxmaf = 0.5, + minac2 = FALSE, + outpop = NULL, + outpop_scale = TRUE, + transitions = TRUE, + transversions = TRUE, + overwrite = FALSE, + adjust_pseudohaploid = TRUE, + fst = TRUE, + afprod = TRUE, + poly_only = c("f2"), + apply_corr = TRUE, + n_cores = 1, + quiet = FALSE +) +} +\arguments{ +\item{.x}{a \code{\link{gen_tibble}}} + +\item{outdir}{Directory where data will be stored.} + +\item{blgsize}{SNP block size in Morgan. Default is 0.05 (5 cM). If \code{blgsize} +is 100 or greater, if will be interpreted as base pair distance rather than +centimorgan distance.} + +\item{maxmem}{Maximum amount of memory to be used. If the required amount +of memory exceeds \code{maxmem}, allele frequency data will be split into blocks, +and the computation will be performed separately on each block pair. +This doesn't put a precise cap on the amount of memory used (it used to +at some point). Set this parameter to lower values if you run out of memory +while running this function. Set it to higher values if this function is +too slow and you have lots of memory.} + +\item{maxmiss}{Discard SNPs which are missing in a fraction of populations +higher than \code{maxmiss}} + +\item{minmaf}{Discard SNPs with minor allele frequency less than \code{minmaf}} + +\item{maxmaf}{Discard SNPs with minor allele frequency greater than +than \code{maxmaf}} + +\item{minac2}{Discard SNPs with allele count lower than 2 in any population +(default \code{FALSE}). This option should be set to \code{TRUE} when computing +f3-statistics where one population consists mostly of pseudohaploid samples. +Otherwise heterozygosity estimates and thus f3-estimates can be biased. +\code{minac2 == 2} will discard SNPs with allele count lower than 2 in any +non-singleton population (this option is experimental and is based on the +hypothesis that using SNPs with allele count lower than 2 only leads to +biases in non-singleton populations). Note that, While the \code{minac2} option +discards +SNPs with allele count lower than 2 in any population, the \code{qp3pop} +function will only discard SNPs with allele count lower than 2 in the +first (target) population (when the first argument is the prefix of +a genotype file; i.e. it is applied directly to a genotype file, not via +precomputing f2 from a \code{\link{gen_tibble}}).} + +\item{outpop}{Keep only SNPs which are heterozygous in this population} + +\item{outpop_scale}{Scale f2-statistics by the inverse \code{outpop} +heteroygosity (\code{1/(p*(1-p))}). Providing \code{outpop} and setting +\code{outpop_scale} to \code{TRUE} will give the same results as the original +\emph{qpGraph} when the \code{outpop} parameter has been set, but it has the +disadvantage of treating one population different from the others. This +may limit the use of these f2-statistics for other models.} + +\item{transitions}{Set this to \code{FALSE} to exclude transition SNPs} + +\item{transversions}{Set this to \code{FALSE} to exclude transversion SNPs} + +\item{overwrite}{Overwrite existing files in \code{outdir}} + +\item{adjust_pseudohaploid}{Genotypes of pseudohaploid samples are +usually coded as \code{0} or \code{2}, even though only one allele is observed. +\code{adjust_pseudohaploid} ensures that the observed allele count increases +only by \code{1} for each pseudohaploid sample. If \code{TRUE} (default), samples +that don't have any genotypes coded as \code{1} among the first 1000 SNPs are +automatically identified as pseudohaploid. This leads to slightly more +accurate estimates of f-statistics. Setting this parameter to \code{FALSE} +treats all samples as diploid and is equivalent to the \emph{ADMIXTOOLS} \code{ inbreed: NO} option. Setting \code{adjust_pseudohaploid} to an integer \code{n} +will check the first \code{n} SNPs instead of the first 1000 SNPs.} + +\item{fst}{Write files with pairwise FST for every population pair. +Setting this to FALSE can make \code{extract_f2} faster and will require less memory.} + +\item{afprod}{Write files with allele frequency products for every +population pair. Setting this to FALSE can make \code{extract_f2} faster and +will require less memory.} + +\item{poly_only}{Specify whether SNPs with identical allele frequencies in +every population should be discarded (\code{poly_only = TRUE}), or whether they +should be used (\code{poly_only = FALSE}). By default (\code{poly_only = c("f2")}), +these SNPs will be used to compute FST and allele frequency products, but +not to compute f2 (this is the default option in the original ADMIXTOOLS).} + +\item{apply_corr}{Apply small-sample-size correction when computing +f2-statistics (default \code{TRUE})} + +\item{n_cores}{Parallelize computation across \code{n_cores} cores.} + +\item{quiet}{Suppress printing of progress updates} +} +\value{ +SNP metadata (invisibly) +} +\description{ +This function prepares data for various \emph{ADMIXTOOLS 2} functions fro the +package \emph{ADMIXTOOLS 2}. It +takes a \code{\link{gen_tibble}}, +computes allele frequencies and blocked f2-statistics, +and writes the results to \code{outdir}. It is equivalent to +\code{admixtools::extract_f2()}. +} diff --git a/man/gt_to_aftable.Rd b/man/gt_to_aftable.Rd new file mode 100644 index 00000000..fcd8859e --- /dev/null +++ b/man/gt_to_aftable.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gt_extract_f2.R +\name{gt_to_aftable} +\alias{gt_to_aftable} +\title{Create an allele frequency table (aftable) for admixtools} +\usage{ +gt_to_aftable(.x, adjust_pseudohaploid = TRUE, n_cores = bigstatsr::nb_cores()) +} +\arguments{ +\item{.x}{the \code{\link{gen_tibble}}} + +\item{adjust_pseudohaploid}{Genotypes of pseudohaploid samples are +usually coded as \code{0} or \code{2}, even though only one allele is observed. +\code{adjust_pseudohaploid} ensures that the observed allele count increases +only by \code{1} for each pseudohaploid sample. If \code{TRUE} (default), samples +that don't have any genotypes coded as \code{1} among the first 1000 SNPs are +automatically identified as pseudohaploid. This leads to slightly more +accurate estimates of f-statistics. Setting this parameter to \code{FALSE} +treats all samples as diploid and is equivalent to the \emph{ADMIXTOOLS} \code{ inbreed: NO} option. Setting \code{adjust_pseudohaploid} to an integer \code{n} +will check the first \code{n} SNPs instead of the first 1000 SNPs.} + +\item{n_cores}{Parallelize computation across \code{n_cores} cores.} +} +\value{ +a list of 3 elements: afs, counts, and snp +} +\description{ +This function is equivalent to \code{anygeno_to_aftable()}, but the aftable is +created directly from the \code{gen_tibble}. +} +\keyword{internal} diff --git a/man/identify_pseudohaploids.Rd b/man/identify_pseudohaploids.Rd new file mode 100644 index 00000000..84150095 --- /dev/null +++ b/man/identify_pseudohaploids.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/gt_extract_f2.R +\name{identify_pseudohaploids} +\alias{identify_pseudohaploids} +\title{Identify pseudohaploids} +\usage{ +identify_pseudohaploids(x, n_test = 1000) +} +\arguments{ +\item{x}{the gen_tibble} + +\item{n_test}{the number of loci being tested} +} +\value{ +a numeric vector of ploidy +} +\description{ +Pseudohaploids are coded as all homozygotes; we find them by checking the +first 'n_test' loci and call a pseudohaploid if they have zero heterozygosity +(this is the same strategy employed in admixtools) +} +\keyword{internal} diff --git a/man/loci_alt_freq.Rd b/man/loci_alt_freq.Rd index 601e277d..a69c5fe3 100644 --- a/man/loci_alt_freq.Rd +++ b/man/loci_alt_freq.Rd @@ -17,7 +17,7 @@ loci_alt_freq(.x, ...) \method{loci_alt_freq}{vctrs_bigSNP}(.x, ...) -\method{loci_alt_freq}{grouped_df}(.x, ...) +\method{loci_alt_freq}{grouped_df}(.x, n_cores = bigstatsr::nb_cores(), ...) loci_maf(.x, ...) @@ -25,7 +25,7 @@ loci_maf(.x, ...) \method{loci_maf}{vctrs_bigSNP}(.x, ...) -\method{loci_maf}{grouped_df}(.x, ...) +\method{loci_maf}{grouped_df}(.x, n_cores = bigstatsr::nb_cores(), ...) } \arguments{ \item{.x}{a vector of class \code{vctrs_bigSNP} (usually the \code{genotypes} column of @@ -33,6 +33,8 @@ a \code{\link{gen_tibble}} object), or a \code{\link{gen_tibble}}.} \item{...}{other arguments passed to specific methods, currently unused.} + +\item{n_cores}{number of cores to be used, it defaults to \code{\link[bigstatsr:reexports]{bigstatsr::nb_cores()}}} } \value{ a vector of frequencies, one per locus diff --git a/man/loci_missingness.Rd b/man/loci_missingness.Rd index 9b7ed1fd..3dcc2545 100644 --- a/man/loci_missingness.Rd +++ b/man/loci_missingness.Rd @@ -13,7 +13,7 @@ loci_missingness(.x, as_counts = FALSE, ...) \method{loci_missingness}{vctrs_bigSNP}(.x, as_counts = FALSE, ...) -\method{loci_missingness}{grouped_df}(.x, as_counts = FALSE, ...) +\method{loci_missingness}{grouped_df}(.x, as_counts = FALSE, n_cores = bigstatsr::nb_cores(), ...) } \arguments{ \item{.x}{a vector of class \code{vctrs_bigSNP} (usually the \code{genotypes} column of @@ -24,6 +24,8 @@ or a \code{\link{gen_tibble}}.} should be returned. It defaults to FALSE (i.e. rates are returned by default).} \item{...}{other arguments passed to specific methods.} + +\item{n_cores}{number of cores to be used, it defaults to \code{\link[bigstatsr:reexports]{bigstatsr::nb_cores()}}} } \value{ a vector of frequencies, one per locus diff --git a/man/loci_sums.Rd b/man/loci_sums.Rd deleted file mode 100644 index e11b2f8f..00000000 --- a/man/loci_sums.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/loci_sums.R -\name{loci_sums} -\alias{loci_sums} -\alias{loci_sums.tbl_df} -\alias{loci_sums.vctrs_bigSNP} -\alias{loci_sums.grouped_df} -\title{Estimates the sum of genotypes at each each locus} -\usage{ -loci_sums(.x, ...) - -\method{loci_sums}{tbl_df}(.x, ...) - -\method{loci_sums}{vctrs_bigSNP}(.x, ...) - -\method{loci_sums}{grouped_df}(.x, ...) -} -\arguments{ -\item{.x}{a vector of class \code{vctrs_bigSNP} (usually the \code{genotype} column of -a \code{\link{gen_tibble}} object), -or a \code{\link{gen_tibble}}.} - -\item{...}{other arguments passed to specific methods.} -} -\value{ -a vector of frequencies, one per locus -} -\description{ -Estimate the sum of the alternate allele at each locus. This is unlikely to be useful -directly, but it is used by other functions that compute various statistics. -} diff --git a/man/pairwise_pop_fst.Rd b/man/pairwise_pop_fst.Rd index c35011b9..465b1dce 100644 --- a/man/pairwise_pop_fst.Rd +++ b/man/pairwise_pop_fst.Rd @@ -7,7 +7,8 @@ pairwise_pop_fst( .x, by_locus = FALSE, - method = c("Hudson", "Nei87", "Nei86", "WC84") + method = c("Hudson", "Nei87", "Nei86", "WC84"), + n_cores = bigstatsr::nb_cores() ) } \arguments{ @@ -18,6 +19,8 @@ or as a single genome wide value obtained by taking the ratio of the mean numera and denominator (FALSE, the default).} \item{method}{one of 'Hudson', 'Nei86', 'Nei87', and 'WC84'} + +\item{n_cores}{number of cores to be used, it defaults to \code{\link[bigstatsr:reexports]{bigstatsr::nb_cores()}}} } \value{ a tibble of genome-wide pairwise Fst values with each pairwise combination @@ -25,7 +28,6 @@ as a row if "by_locus=FALSE", else a list including the tibble of genome-wide values as well as a matrix with pairwise Fst by locus with loci as rows and and pairwise combinations as columns. -a matrix } \description{ This function computes pairwise Fst. The following methods are implemented: diff --git a/man/show_genotypes.Rd b/man/show_genotypes.Rd index 75ec847a..fabae74b 100644 --- a/man/show_genotypes.Rd +++ b/man/show_genotypes.Rd @@ -6,7 +6,7 @@ \alias{show_genotypes.vctrs_bigSNP} \title{Show the genotypes of a \code{gen_tibble}} \usage{ -show_genotypes(.x, ...) +show_genotypes(.x, indiv_indices = NULL, loci_indices = NULL, ...) \method{show_genotypes}{tbl_df}(.x, indiv_indices = NULL, loci_indices = NULL, ...) @@ -17,11 +17,11 @@ show_genotypes(.x, ...) a \code{\link{gen_tibble}} object), or a \code{\link{gen_tibble}}.} -\item{...}{currently unused.} - \item{indiv_indices}{indices of individuals} \item{loci_indices}{indices of loci} + +\item{...}{currently unused.} } \value{ a matrix of counts of the alternative alleles (see \code{\link[=show_loci]{show_loci()}}) to diff --git a/man/vcf_to_fbm.Rd b/man/vcf_to_fbm.Rd index 1dee0022..d9550174 100644 --- a/man/vcf_to_fbm.Rd +++ b/man/vcf_to_fbm.Rd @@ -4,7 +4,7 @@ \alias{vcf_to_fbm} \title{Convert vcf to FBM.} \usage{ -vcf_to_fbm(vcf_path, loci_per_chunk = 1000, backingfile = NULL, quiet = FALSE) +vcf_to_fbm(vcf_path, loci_per_chunk = 1e+05, backingfile = NULL, quiet = FALSE) } \arguments{ \item{vcf_path}{the path to the vcf} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 75ed8959..2c788e71 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,6 +11,71 @@ Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif +// gt_grouped_alt_freq_diploid +ListOf gt_grouped_alt_freq_diploid(Environment BM, const IntegerVector& rowInd, const IntegerVector& colInd, const IntegerVector& groupIds, int ngroups, int ncores); +RcppExport SEXP _tidypopgen_gt_grouped_alt_freq_diploid(SEXP BMSEXP, SEXP rowIndSEXP, SEXP colIndSEXP, SEXP groupIdsSEXP, SEXP ngroupsSEXP, SEXP ncoresSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Environment >::type BM(BMSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type rowInd(rowIndSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type colInd(colIndSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type groupIds(groupIdsSEXP); + Rcpp::traits::input_parameter< int >::type ngroups(ngroupsSEXP); + Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP); + rcpp_result_gen = Rcpp::wrap(gt_grouped_alt_freq_diploid(BM, rowInd, colInd, groupIds, ngroups, ncores)); + return rcpp_result_gen; +END_RCPP +} +// gt_grouped_alt_freq_pseudohap +ListOf gt_grouped_alt_freq_pseudohap(Environment BM, const IntegerVector& rowInd, const IntegerVector& colInd, const IntegerVector& groupIds, int ngroups, const IntegerVector& ploidy, int ncores); +RcppExport SEXP _tidypopgen_gt_grouped_alt_freq_pseudohap(SEXP BMSEXP, SEXP rowIndSEXP, SEXP colIndSEXP, SEXP groupIdsSEXP, SEXP ngroupsSEXP, SEXP ploidySEXP, SEXP ncoresSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Environment >::type BM(BMSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type rowInd(rowIndSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type colInd(colIndSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type groupIds(groupIdsSEXP); + Rcpp::traits::input_parameter< int >::type ngroups(ngroupsSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type ploidy(ploidySEXP); + Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP); + rcpp_result_gen = Rcpp::wrap(gt_grouped_alt_freq_pseudohap(BM, rowInd, colInd, groupIds, ngroups, ploidy, ncores)); + return rcpp_result_gen; +END_RCPP +} +// gt_grouped_missingness +NumericMatrix gt_grouped_missingness(Environment BM, const IntegerVector& rowInd, const IntegerVector& colInd, const IntegerVector& groupIds, int ngroups, int ncores); +RcppExport SEXP _tidypopgen_gt_grouped_missingness(SEXP BMSEXP, SEXP rowIndSEXP, SEXP colIndSEXP, SEXP groupIdsSEXP, SEXP ngroupsSEXP, SEXP ncoresSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Environment >::type BM(BMSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type rowInd(rowIndSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type colInd(colIndSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type groupIds(groupIdsSEXP); + Rcpp::traits::input_parameter< int >::type ngroups(ngroupsSEXP); + Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP); + rcpp_result_gen = Rcpp::wrap(gt_grouped_missingness(BM, rowInd, colInd, groupIds, ngroups, ncores)); + return rcpp_result_gen; +END_RCPP +} +// gt_grouped_summaries +ListOf gt_grouped_summaries(Environment BM, const IntegerVector& rowInd, const IntegerVector& colInd, const IntegerVector& groupIds, int ngroups, int ncores); +RcppExport SEXP _tidypopgen_gt_grouped_summaries(SEXP BMSEXP, SEXP rowIndSEXP, SEXP colIndSEXP, SEXP groupIdsSEXP, SEXP ngroupsSEXP, SEXP ncoresSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Environment >::type BM(BMSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type rowInd(rowIndSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type colInd(colIndSEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type groupIds(groupIdsSEXP); + Rcpp::traits::input_parameter< int >::type ngroups(ngroupsSEXP); + Rcpp::traits::input_parameter< int >::type ncores(ncoresSEXP); + rcpp_result_gen = Rcpp::wrap(gt_grouped_summaries(BM, rowInd, colInd, groupIds, ngroups, ncores)); + return rcpp_result_gen; +END_RCPP +} // SNPHWE2 double SNPHWE2(int32_t obs_hets, int32_t obs_hom1, int32_t obs_hom2, uint32_t midp); RcppExport SEXP _tidypopgen_SNPHWE2(SEXP obs_hetsSEXP, SEXP obs_hom1SEXP, SEXP obs_hom2SEXP, SEXP midpSEXP) { @@ -106,6 +171,10 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { + {"_tidypopgen_gt_grouped_alt_freq_diploid", (DL_FUNC) &_tidypopgen_gt_grouped_alt_freq_diploid, 6}, + {"_tidypopgen_gt_grouped_alt_freq_pseudohap", (DL_FUNC) &_tidypopgen_gt_grouped_alt_freq_pseudohap, 7}, + {"_tidypopgen_gt_grouped_missingness", (DL_FUNC) &_tidypopgen_gt_grouped_missingness, 6}, + {"_tidypopgen_gt_grouped_summaries", (DL_FUNC) &_tidypopgen_gt_grouped_summaries, 6}, {"_tidypopgen_SNPHWE2", (DL_FUNC) &_tidypopgen_SNPHWE2, 4}, {"_tidypopgen_SNPHWE_t", (DL_FUNC) &_tidypopgen_SNPHWE_t, 4}, {"_tidypopgen_SNPHWE_midp_t", (DL_FUNC) &_tidypopgen_SNPHWE_midp_t, 4}, diff --git a/src/gt_grouped_alt_freq_diploid.cpp b/src/gt_grouped_alt_freq_diploid.cpp new file mode 100644 index 00000000..125730cb --- /dev/null +++ b/src/gt_grouped_alt_freq_diploid.cpp @@ -0,0 +1,43 @@ +/******************************************************************************/ + +#include + +/******************************************************************************/ + +// [[Rcpp::export]] +ListOf gt_grouped_alt_freq_diploid(Environment BM, + const IntegerVector& rowInd, + const IntegerVector& colInd, + const IntegerVector& groupIds, + int ngroups, + int ncores) { + + XPtr xpBM = BM["address"]; + SubBMCode256Acc macc(xpBM, rowInd, colInd, BM["code256"], 1); + + size_t n = macc.nrow(); // number of individuals + size_t m = macc.ncol(); // number of loci + + NumericMatrix freq(m, ngroups); + NumericMatrix valid_alleles(m, ngroups); + +#pragma omp parallel for num_threads(ncores) + for (size_t j = 0; j < m; j++) { + for (size_t i = 0; i < n; i++) { + double x = macc(i, j); + if (x>-1){ + freq(j, groupIds[i]) += x; + valid_alleles(j, groupIds[i]) +=2; + } + } + // now for each group, divide freq by valid_alleles + for (size_t group_i = 0; group_i < ngroups; group_i++) { + freq(j, group_i) = freq(j, group_i) / valid_alleles(j, group_i); + } + } + + return List::create(_["freq_alt"] = freq, + _["n"] = valid_alleles); +} + +/******************************************************************************/ diff --git a/src/gt_grouped_alt_freq_pseudohap.cpp b/src/gt_grouped_alt_freq_pseudohap.cpp new file mode 100644 index 00000000..e4a1dcf0 --- /dev/null +++ b/src/gt_grouped_alt_freq_pseudohap.cpp @@ -0,0 +1,49 @@ +/******************************************************************************/ + +#include + +/******************************************************************************/ + +// [[Rcpp::export]] +ListOf gt_grouped_alt_freq_pseudohap(Environment BM, + const IntegerVector& rowInd, + const IntegerVector& colInd, + const IntegerVector& groupIds, + int ngroups, + const IntegerVector& ploidy, + int ncores) { + + XPtr xpBM = BM["address"]; + SubBMCode256Acc macc(xpBM, rowInd, colInd, BM["code256"], 1); + + size_t n = macc.nrow(); // number of individuals + size_t m = macc.ncol(); // number of loci + + NumericMatrix freq(m, ngroups); + NumericMatrix valid_alleles(m, ngroups); + +#pragma omp parallel for num_threads(ncores) + for (size_t j = 0; j < m; j++) { + for (size_t i = 0; i < n; i++) { + double x = macc(i, j); + if (x>-1){ + if (ploidy[i]==2){ + freq(j, groupIds[i]) += x; + valid_alleles(j, groupIds[i]) +=2; + } else { + freq(j, groupIds[i]) += x/2; + valid_alleles(j, groupIds[i]) +=1; + } + } + } + // now for each group, divide freq by valid_alleles + for (size_t group_i = 0; group_i < ngroups; group_i++) { + freq(j, group_i) = freq(j, group_i) / valid_alleles(j, group_i); + } + } + + return List::create(_["freq_alt"] = freq, + _["n"] = valid_alleles); +} + +/******************************************************************************/ diff --git a/src/gt_grouped_missingness.cpp b/src/gt_grouped_missingness.cpp new file mode 100644 index 00000000..524f92eb --- /dev/null +++ b/src/gt_grouped_missingness.cpp @@ -0,0 +1,35 @@ +/******************************************************************************/ + +#include + +/******************************************************************************/ + +// [[Rcpp::export]] +NumericMatrix gt_grouped_missingness(Environment BM, + const IntegerVector& rowInd, + const IntegerVector& colInd, + const IntegerVector& groupIds, + int ngroups, + int ncores) { + + XPtr xpBM = BM["address"]; + SubBMCode256Acc macc(xpBM, rowInd, colInd, BM["code256"], 1); + + size_t n = macc.nrow(); // number of individuals + size_t m = macc.ncol(); // number of loci + + NumericMatrix na_counts(m, ngroups); + +#pragma omp parallel for num_threads(ncores) + for (size_t j = 0; j < m; j++) { + for (size_t i = 0; i < n; i++) { + double x = macc(i, j); + if (!(x>-1)){ + na_counts(j, groupIds[i]) +=1; + } + } + } + return na_counts; +} + +/******************************************************************************/ diff --git a/src/gt_grouped_summaries.cpp b/src/gt_grouped_summaries.cpp new file mode 100644 index 00000000..d65010a5 --- /dev/null +++ b/src/gt_grouped_summaries.cpp @@ -0,0 +1,55 @@ +/******************************************************************************/ + +#include + +/******************************************************************************/ +// Summarise a grouped gen_tibble +// this only works for diploid data + + +// [[Rcpp::export]] +ListOf gt_grouped_summaries(Environment BM, + const IntegerVector& rowInd, + const IntegerVector& colInd, + const IntegerVector& groupIds, + int ngroups, + int ncores) { + + XPtr xpBM = BM["address"]; + SubBMCode256Acc macc(xpBM, rowInd, colInd, BM["code256"], 1); + + size_t n = macc.nrow(); // number of individuals + size_t m = macc.ncol(); // number of loci + + NumericMatrix freq(m, ngroups); + NumericMatrix ref_freq(m, ngroups); + NumericMatrix valid_alleles(m, ngroups); + NumericMatrix heterozygotes(m, ngroups); + +#pragma omp parallel for num_threads(ncores) + for (size_t j = 0; j < m; j++) { + for (size_t i = 0; i < n; i++) { + double x = macc(i, j); + if (x>-1){ + freq(j, groupIds[i]) += x; + valid_alleles(j, groupIds[i]) +=2; + if (x==1){ + heterozygotes(j, groupIds[i]) +=2; // we add 2 so that we can then divide by valid alleles + } + } + } + // now for each group, divide freq by valid_alleles + for (size_t group_i = 0; group_i < ngroups; group_i++) { + freq(j, group_i) = freq(j, group_i) / valid_alleles(j, group_i); + ref_freq(j, group_i) = 1-freq(j, group_i); + heterozygotes(j, group_i) = heterozygotes(j, group_i) / valid_alleles(j, group_i); + } + } + + return List::create(_["freq_alt"] = freq, + _["freq_ref"] = ref_freq, + _["n"] = valid_alleles, + _["het_obs"] = heterozygotes); +} + +/******************************************************************************/ diff --git a/tests/testthat/test_gt_extract_f2.R b/tests/testthat/test_gt_extract_f2.R new file mode 100644 index 00000000..69f09113 --- /dev/null +++ b/tests/testthat/test_gt_extract_f2.R @@ -0,0 +1,54 @@ +test_genotypes <- rbind(c(1,1,0,1,1,0), + c(2,1,0,NA,0,0), + c(2,NA,0,0,1,1), + c(2,0,0,2,0,0), + c(1,2,0,1,2,1), + c(0,0,0,0,NA,2), + c(0,1,2,0,1,NA)) +test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"), + population = c("pop1","pop1","pop2","pop2","pop1","pop3","pop3")) +test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=paste0("chr",c(1,1,1,1,2,2)), + position=as.integer(c(3,5,65,343,23,456)), + genetic_dist = as.integer(rep(0,6)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) + +test_gt <- gen_tibble(x = test_genotypes, + loci = test_loci, + indiv_meta = test_indiv_meta, + quiet = TRUE) +test_gt <- test_gt %>% group_by(population) + +test_that("extract f2 correctly",{ + # process the data with admixtools (note that we get some warnings) + bed_file <- gt_as_plink(test_gt, file = tempfile("test_bed")) + + # test af table + # without adjusting pseudohaploids + adm_aftable <- admixtools:::anygeno_to_aftable(bigsnpr::sub_bed(bed_file), + adjust_pseudohaploid = FALSE) + gt_aftable <- gt_to_aftable(test_gt, adjust_pseudohaploid = FALSE) + expect_true(all.equal(adm_aftable, gt_aftable, check.attributes= FALSE)) + # now adjusting the pseudohaploids + adm_aftable <- admixtools:::anygeno_to_aftable(bigsnpr::sub_bed(bed_file), + adjust_pseudohaploid = TRUE) + gt_aftable <- gt_to_aftable(test_gt) + expect_true(all.equal(adm_aftable, gt_aftable, check.attributes= FALSE)) + + adm_outdir <- file.path(tempdir(),"adm_f2") + unlink(file.path(adm_outdir,"*"),recursive = TRUE) + # we get a few warnings due to the small sample size + suppressWarnings(admixtools::extract_f2(bigsnpr::sub_bed(bed_file), outdir = adm_outdir)) + expect_warning(adm_f2 <- admixtools::f2_from_precomp(adm_outdir)) + # now try to do the same with gen_tibble + gt_outdir <- file.path(tempdir(),"gt_f2") + unlink(file.path(gt_outdir,"*"),recursive = TRUE) + # we get same warning due to the small dataset + # TODO can we capture the warnings above and then check that they are the same?!? + suppressWarnings(gt_extract_f2(test_gt, outdir = gt_outdir)) + expect_warning(gt_f2 <- admixtools::f2_from_precomp(adm_outdir)) + expect_true(all.equal(adm_f2, gt_f2)) + + +}) diff --git a/tests/testthat/test_loci_freq.R b/tests/testthat/test_loci_freq.R index 9a8ac109..3179d14c 100644 --- a/tests/testthat/test_loci_freq.R +++ b/tests/testthat/test_loci_freq.R @@ -1,6 +1,6 @@ test_that("loci_alt_freq and loci_maf computes correctly",{ test_indiv_meta <- data.frame (id=c("a","b","c"), - population = c("pop1","pop1","pop2")) + population = c("pop1","pop1","pop2")) test_genotypes <- rbind(c(1,1,0,1,1,2), c(2,1,0,NA,0,NA), c(2,2,0,0,1,NA)) @@ -10,10 +10,7 @@ test_that("loci_alt_freq and loci_maf computes correctly",{ genetic_dist = as.integer(rep(0,6)), allele_ref = c("A","T","C","G","C","T"), allele_alt = c("T","C", NA,"C","G","A")) - test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE) - - # raw frequencies freq <- colSums(test_genotypes, na.rm=TRUE)/(c(3,3,3,2,3,1)*2) expect_true(all(loci_alt_freq(test_gt$genotypes)==freq)) @@ -31,3 +28,35 @@ test_that("loci_alt_freq and loci_maf computes correctly",{ freq[freq>0.5] <- 1 - freq[freq>0.5] expect_true(all(loci_maf(test_gt$genotypes)==freq)) }) + +test_that("loci_alt_freq and loci_maf on grouped tibbles",{ + test_genotypes <- rbind(c(1,1,0,1,1,0), + c(2,1,0,NA,0,0), + c(2,NA,0,0,1,1), + c(1,0,0,1,0,0), + c(1,2,0,1,2,1), + c(0,0,0,0,NA,1), + c(0,1,1,0,1,NA)) + test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"), + population = c("pop1","pop1","pop2","pop2","pop1","pop3","pop3")) + test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=paste0("chr",c(1,1,1,1,2,2)), + position=as.integer(c(3,5,65,343,23,456)), + genetic_dist = as.integer(rep(0,6)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) + + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE) + test_gt <- test_gt %>% group_by(population) + # compute by using group map + loci_freq_map <- test_gt %>% group_map(.f=~loci_alt_freq(.x)) + # use fast cpp code (limit cores to 2) + loci_freq_grp <- test_gt %>% loci_alt_freq(n_cores=2) + all.equal(loci_freq_map, loci_freq_grp) + # and now for maf + # compute by using group map + loci_maf_map <- test_gt %>% group_map(.f=~loci_maf(.x)) + # use fast cpp code (limit cores to 2) + loci_maf_grp <- test_gt %>% loci_maf(n_cores=2) + expect_true(all.equal(loci_maf_map, loci_maf_grp)) +}) diff --git a/tests/testthat/test_loci_ld_clump.R b/tests/testthat/test_loci_ld_clump.R index 4d5e83f8..17d879d1 100644 --- a/tests/testthat/test_loci_ld_clump.R +++ b/tests/testthat/test_loci_ld_clump.R @@ -11,7 +11,7 @@ test_loci <- data.frame(name=paste0("rs",1:6), allele_alt = c("T","C", NA,"C","G","A")) test_gt <- gen_tibble(x = test_genotypes, indiv_meta = test_indiv_meta, loci = test_loci, - backingfile = tempfile()) + backingfile = tempfile(), quiet = TRUE) test_that("ld clumping runs",{ diff --git a/tests/testthat/test_loci_missingness.R b/tests/testthat/test_loci_missingness.R index f17f07ea..c448915f 100644 --- a/tests/testthat/test_loci_missingness.R +++ b/tests/testthat/test_loci_missingness.R @@ -1,4 +1,4 @@ -test_that("snpbin_list_means computes correctly",{ +test_that("loci_missingness",{ test_indiv_meta <- data.frame (id=c("a","b","c"), population = c("pop1","pop1","pop2")) test_genotypes <- rbind(c(1,1,0,1,1,2), @@ -21,3 +21,30 @@ test_that("snpbin_list_means computes correctly",{ # convert to frequencies expect_true(all(loci_missingness(test_gt$genotypes)==n_na/nrow(test_genotypes))) }) + +test_that("loci_missingness on grouped tibble",{ + test_genotypes <- rbind(c(1,1,0,1,1,0), + c(2,1,0,NA,0,0), + c(2,NA,0,0,1,1), + c(1,0,0,1,0,0), + c(1,2,0,1,2,1), + c(0,0,0,0,NA,1), + c(0,1,1,0,1,NA)) + test_indiv_meta <- data.frame (id=c("a","b","c","d","e","f","g"), + population = c("pop1","pop1","pop2","pop2","pop1","pop3","pop3")) + test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=paste0("chr",c(1,1,1,1,2,2)), + position=as.integer(c(3,5,65,343,23,456)), + genetic_dist = as.integer(rep(0,6)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) + + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE) + test_gt <- test_gt %>% group_by(population) + # compute by using group map + loci_miss_map <- test_gt %>% group_map(.f=~loci_missingness(.x)) + # use fast cpp code (limit cores to 2) + loci_miss_grp <- test_gt %>% loci_missingness(n_cores=2) + expect_true(all.equal(loci_miss_map, loci_miss_grp)) + +}) diff --git a/tests/testthat/test_pairwise_pop_fst.R b/tests/testthat/test_pairwise_pop_fst.R index ea97a6c5..f8ed5108 100644 --- a/tests/testthat/test_pairwise_pop_fst.R +++ b/tests/testthat/test_pairwise_pop_fst.R @@ -17,12 +17,13 @@ test_loci <- data.frame(name=paste0("rs",1:6), test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE) -test_that("pop_fst and pop_fist compute correctly",{ +test_that("pairwise_pop_fst compute correctly",{ test_gt <- test_gt %>% dplyr::group_by(population) test_hier <- gt_as_hierfstat(test_gt) # compare results against hierfstat for Nei87 (Nei86 does not correct for Ho # when computing Ht, so it gives a different result) nei_gt <- test_gt %>% pairwise_pop_fst(method="Nei87") + nei_hier <- hierfstat::pairwise.neifst(test_hier) # hiefstat values are rounded to 4 dp expect_true(all.equal(tidy_dist_matrix(nei_hier)$value, round(nei_gt$value,4))) diff --git a/tests/testthat/test_ploidy_vcf.R b/tests/testthat/test_ploidy_vcf.R new file mode 100644 index 00000000..708f61a2 --- /dev/null +++ b/tests/testthat/test_ploidy_vcf.R @@ -0,0 +1,9 @@ +test_that("import a vcf with multiple ploidy",{ + vcf_path <- system.file("/extdata/ploidy_test.vcf.gz", + package = "tidypopgen") + test_gt <- gen_tibble(vcf_path, backingfile = tempfile(), quiet = TRUE) + # it's mixed ploidy + expect_true(show_ploidy(test_gt)==0) + # individuals are either 2 or 4 + expect_true(all(indiv_ploidy(test_gt) %in% c(2,4))) +}) diff --git a/vignettes/overview.Rmd b/vignettes/overview.Rmd index 7d8e3925..2a4ab234 100644 --- a/vignettes/overview.Rmd +++ b/vignettes/overview.Rmd @@ -31,12 +31,12 @@ be unique for each individual), `population` giving the population the individuals belong to (a `factor`), and `genotypes` (stored in a compressed format as a File-Backed Matrix, with the vector in the tibble providing the row indices of the matrix for each individual). The real data reside on disk, and an attribute `bigsnp` of -the `genotype` column contains all the information to acess it. There is also an +the `genotype` column contains all the information to access it. There is also an additional attribute , `loci` which provides all the information about the loci, including the column indices that represent in each locu in the FBM. The vector of row indices and the table of loci can be subsetted and reordered without changing the data on disk; thus, any operation on the `gen_tibble` is fast as it -shapes the indices of the genotyp matrix rather than the matrix itself. +shapes the indices of the genotype matrix rather than the matrix itself. The `loci` tibble includes columns `big_index` for the index in the FBM, `name` for the locus name (a `character`, which must be unique), `chromosome` for the chromosome (`a factor`, if known, otherwise set to `NA`), `position` for the position on the chromosome (`numeric`, if known, @@ -49,13 +49,12 @@ centimorgans) can be added as columns in the `loci` attribute table. [the class type of each column needs checking!!!] In principle, it is possible to use use multiple ways to compress the genotypes. -`tidypopgen` currently uses `bigSNP` from the package `bigsnpr`. It is very -fast and well documented, but it is mostly geared towards diploid data. However, -with a bit of work it could be expanded to multiple ploidy. NOte that the -infrastructure is present to adopt alternative compression approaches in the future -the only constraint is that the data have to be representable as a list of n elements -where n is the number of individuals (i.e. one object per individual). For example, -we could keep the data in memory, and have a `SNPBin` object in each element of the list. +`tidypopgen` currently uses a `bigSNP` object from the package `bigsnpr`. It is very +fast and well documented, but it is mostly geared towards diploid data. `tidypopgen` +expands that object to deal with different levels of ploidy, including multiple ploidy +within a single dataset; however, most functions are currently incompatible with +ploidy levels other than 2 (but they will return a clear error message and avoid +computing anything incorrectly). ## The grammar of population genetics