Skip to content

Commit

Permalink
Merge pull request #48 from EvolEcolGroup/fast_vcf
Browse files Browse the repository at this point in the history
Fast vcf parsing
  • Loading branch information
dramanica authored Aug 9, 2024
2 parents 79c827e + 7ef4c43 commit 6a7fca6
Show file tree
Hide file tree
Showing 26 changed files with 892 additions and 23 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ Imports:
stringr,
tidyselect,
tidyr,
utils,
Rcpp,
UpSetR,
vctrs
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ export(gt_as_genlight)
export(gt_as_geno_lea)
export(gt_as_hierfstat)
export(gt_as_plink)
export(gt_as_vcf)
export(gt_cluster_pca)
export(gt_cluster_pca_best_k)
export(gt_dapc)
Expand Down
12 changes: 12 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,15 @@ increment_king_numerator <- function(k, n_Aa_i, genotype0, genotype1, genotype2,
invisible(.Call(`_tidypopgen_increment_king_numerator`, k, n_Aa_i, genotype0, genotype1, genotype2, genotype_valid, BM, rowInd, colInd))
}

extractAltAlleleCountsFromVCF <- function(filename, allele_counts, ploidy, numIndividuals, missingValue, maxLoci = 1000L, skipLoci = 100L, diploid = FALSE) {
.Call(`_tidypopgen_extractAltAlleleCountsFromVCF`, filename, allele_counts, ploidy, numIndividuals, missingValue, maxLoci, skipLoci, diploid)
}

get_ploidy_from_VCF <- function(filename) {
.Call(`_tidypopgen_get_ploidy_from_VCF`, filename)
}

write_to_FBM <- function(BM, allele_counts, col_start, n_loci, ncores) {
invisible(.Call(`_tidypopgen_write_to_FBM`, BM, allele_counts, col_start, n_loci, ncores))
}

18 changes: 15 additions & 3 deletions R/gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@
#' if `x` is a vcf or packedancestry file)
#' @param ... if `x` is the name of a vcf file, additional arguments
#' passed to [vcfR::read.vcfR()]. Otherwise, unused.
#' @param parser the name of the parser used for VCF, either "cpp" to use
#' a fast C++ parser, or "vcfR" to use the R package `vcfR`. The latter is slower
#' but more robust; if "cpp" gives error, try using "vcfR" in case your VCF has
#' an unusual structure.
#' @param valid_alleles a vector of valid allele values; it defaults to 'A','T',
#' 'C' and 'G'.
#' @param missing_alleles a vector of values in the BIM file/loci dataframe that
Expand Down Expand Up @@ -70,12 +74,20 @@ gen_tibble <-
#' @rdname gen_tibble
gen_tibble.character <-
function(x,
..., chunk_size = NULL,
...,
parser = c("vcfR","cpp"),
chunk_size = NULL,
valid_alleles = c("A", "T", "C", "G"),
missing_alleles = c("0","."),
backingfile = NULL,
quiet = FALSE) {

# parser for vcf
parser <- match.arg(parser)
if (parser=="cpp"){
message("The cpp parser is still experimental, use vcfR for serious work")
}

# check that valid alleles does not contain zero
if ("0" %in% valid_alleles){
stop ("'0' can not be a valid allele (it is the default missing allele value!)")
Expand All @@ -89,7 +101,7 @@ gen_tibble.character <-
backingfile = backingfile,
quiet = quiet)
} else if ((tolower(file_ext(x))=="vcf") || (tolower(file_ext(x))=="gz")){
return(gen_tibble_vcf(x = x, ..., chunk_size = chunk_size,
return(gen_tibble_vcf(x = x, ..., parser = parser, chunk_size = chunk_size,
valid_alleles= valid_alleles,
missing_alleles= missing_alleles,
backingfile = backingfile, quiet = quiet))
Expand All @@ -108,7 +120,7 @@ gen_tibble.character <-
} else {
stop("file_path should be pointing to a either a PLINK .bed or .ped file, a bigSNP .rds file or a VCF .vcf or .vcf.gz file")
}
file_in_use <- gt_save(x_gt, quiet = quiet)
file_in_use <- gt_save_light(x_gt, quiet = quiet)
return(x_gt)
}

Expand Down
18 changes: 13 additions & 5 deletions R/gen_tibble_vcf.R
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
# read in a vcf
gen_tibble_vcf <- function(x, ...,
gen_tibble_vcf <- function(x, ..., parser = "vcfR",
chunk_size = NULL,
valid_alleles = c("A", "T", "C", "G"),
missing_alleles = c("0","."),
backingfile = NULL, quiet = FALSE) {
rds_path <- vcf_to_fbm(x,
backingfile = backingfile,
chunk_size = chunk_size,
quiet = quiet)
if (parser == "cpp"){
rds_path <- vcf_to_fbm_cpp(x,
backingfile = backingfile,
chunk_size = chunk_size,
quiet = quiet)
} else {
rds_path <- vcf_to_fbm_vcfR(x,
backingfile = backingfile,
chunk_size = chunk_size,
quiet = quiet)
}

if (!quiet){
message("converting to a gen_tibble...")
}
Expand Down
94 changes: 94 additions & 0 deletions R/gt_as_vcf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#' Convert a `gen_tibble` to a VCF
#'
#' This function write a VCF from a `gen_tibble`.
#'
#' @param x a [`gen_tibble`], with population coded as 'population'
#' @param file the .vcf file name with a path, or NULL (the default) to use the
#' location of the backing files.
#' @param chunk_size the number of loci
#' processed at a time. Automatically set if left to NULL
#' @param overwrite logical, should the file be overwritten if it already exists?
#' @returns the path of the .vcf file
#' @export
#'

gt_as_vcf <- function(x, file = NULL, chunk_size = NULL, overwrite = FALSE){

# vcf writing currently only works for diploid data
stopifnot_diploid(x)

if (is.null(file)){
file <- bigstatsr::sub_bk(attr(x$genotypes,"bigsnp")$genotypes$backingfile,paste0(".vcf"))
}
if (file_ext(file)!="vcf"){
file <- paste0(file,".vcf")
}
if (!overwrite && file.exists(file)){
stop("file ", file," already exists; use 'overwrite=TRUE' to overwrite it")
}

# if chunk is null, get the best guess of an efficient approach
if (is.null(chunk_size)){
chunk_size <- bigstatsr::block_size(nrow(x))
}

# set up chunks
chunks_vec <- c(
rep(chunk_size, floor(count_loci(x) / chunk_size)),
count_loci(x) %% chunk_size
)
chunks_vec_index <- c(1,cumsum(chunks_vec))

# generate the header
vcf_header <- c("##fileformat=VCFv4.3",
paste0("##fileDate=",format(Sys.time(), "%Y%m%e")),
paste0("##source=tidypopgen_v",utils::packageVersion("tidypopgen")))
# create copy of loci table
loci_tbl <- show_loci(x)
# reorder chromosomes levels in the order in which they appear
loci_tbl$chromosome <- forcats::fct_inorder(loci_tbl$chromosome)
# get max position for each chromosome
chromosome_summary <- loci_tbl %>% group_by(.data$chromosome) %>% summarise(max = max(.data$position))
for (chrom_i in 1:nrow(chromosome_summary)){
vcf_header <- c(vcf_header,
paste0("##contig=<ID=", chromosome_summary$chromosome[chrom_i],
",length=",chromosome_summary$max[chrom_i]+1,
">"))
}
vcf_header <- c(vcf_header, '##INFO=<ID=PR,Number=0,Type=Flag,Description="Provisional reference allele, may not be based on real reference genome">')
vcf_header <- c(vcf_header, '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">')
vcf_header <- c(vcf_header, paste0("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t",
paste(x$id, collapse="\t")))
# write it to file
utils::write.table(vcf_header, file = file, quote = FALSE, row.names = FALSE,
col.names = F)
# iterate over genotypes in chunks and append to the vcf body
for (chunk_i in seq_along(chunks_vec)) {
genotypes_matrix <- t(show_genotypes(x,
loci_indices =
chunks_vec_index[chunk_i]:chunks_vec_index[chunk_i+1]))
genotypes_matrix[genotypes_matrix==0] <- "0/0"
genotypes_matrix[genotypes_matrix==1] <- "0/1"
genotypes_matrix[genotypes_matrix==2] <- "1/1"
genotypes_matrix[is.na(genotypes_matrix)] <- "./."
# subset loci to this chunk
loci_sub <- show_loci(x)[chunks_vec_index[chunk_i]:chunks_vec_index[chunk_i+1],]
# add the other columns needed for the
loci_cols <- c("chromosome", "position", "name", "allele_ref", "allele_alt")
loci_sub <- loci_sub %>% select(any_of(loci_cols)) %>%
mutate(qual=".",filter=".", info="PR", format = "GT")
loci_sub[is.na(loci_sub)] <- "."
genotypes_matrix <- cbind(loci_sub, genotypes_matrix)
# append table to previous chunk
utils::write.table(genotypes_matrix, file = file, quote = FALSE, append = TRUE, sep="\t",
col.names = FALSE, row.names = FALSE)
}
return(file)

}

# TODO for tests
# read pop_a_vcf
# rewrite it as a vcf
# re-read it and check that we got back what we started with

36 changes: 33 additions & 3 deletions R/gt_save.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,49 @@
#' @seealso [gt_load()]
#' @export

gt_save <-function(x, file_name = NULL, quiet = FALSE) {
gt_save <- function(x, file_name = NULL, quiet = FALSE) {
if (!inherits(x, "gen_tbl")){
stop("x should be a gen_tibble")
}
# we update the bigsnp object
bigsnpr::snp_save(attr(x$genotypes,"bigsnp"))

if (is.null(file_name)){
file_name <- bigstatsr::sub_bk(gt_get_file_names(x)[2],".gt")
}
if (file_ext(file_name)!="gt"){
file_name <- paste0(file_name,".gt")
}
# we update the bigsnp object
bigsnpr::snp_save(attr(x$genotypes,"bigsnp"))
# and now save our gen_tibble
saveRDS(x,file_name)
if (!quiet){
message("\ngen_tibble saved to ", file_name)
message("using bigSNP file: ", gt_get_file_names(x)[1])
message("with backing file: ", gt_get_file_names(x)[2])
message("make sure that you do NOT delete those files!")
message("to reload the gen_tibble in another session, use:")
message("gt_load('",file_name,"')")
}
return(c(file_name,gt_get_file_names(x)))
}

#' a light version of gt_save that does not resave the RDS, to be used internally
#' when creating a gen_tibble (if we have just created it, there is not need
#' to resave it)
#' @param x a [`gen_tibble`]
#' @param file_name the file name, including the full path. If it does not end
#' with *.gt*, the extension will be added.
#' @param quiet boolean to suppress information about hte files
#' @returns the file name and path of the *.gt* file, together with the *.rds*
#' and *.bk* files
#' @keywords internal
gt_save_light <- function(x, file_name = NULL, quiet = FALSE) {
if (is.null(file_name)){
file_name <- bigstatsr::sub_bk(gt_get_file_names(x)[2],".gt")
}
if (file_ext(file_name)!="gt"){
file_name <- paste0(file_name,".gt")
}
# and now save our gen_tibble
saveRDS(x,file_name)
if (!quiet){
Expand Down
1 change: 0 additions & 1 deletion R/show_genotypes.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ show_genotypes.tbl_df <- function(.x, indiv_indices=NULL, loci_indices=NULL, ...
#' @rdname show_genotypes
show_genotypes.vctrs_bigSNP <- function(.x, indiv_indices=NULL, loci_indices=NULL, ...){
rlang::check_dots_empty()
#browser()
indiv_big_index <- vctrs::vec_data(.x)
if (!is.null(indiv_indices)){
indiv_big_index <- indiv_big_index[indiv_indices]
Expand Down
Loading

0 comments on commit 6a7fca6

Please sign in to comment.