Skip to content

Commit

Permalink
move gt_as_vcf to R directory
Browse files Browse the repository at this point in the history
  • Loading branch information
dramanica committed Jul 1, 2024
1 parent 0cb1591 commit a820b63
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 0 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ export(gt_as_genlight)
export(gt_as_geno_lea)
export(gt_as_hierfstat)
export(gt_as_plink)
export(gt_as_vcf)
export(gt_cluster_pca)
export(gt_cluster_pca_best_k)
export(gt_dapc)
Expand Down
90 changes: 90 additions & 0 deletions R/gt_as_vcf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#' Convert a `gen_tibble` to a VCF
#'
#' This function write a VCF from a `gen_tibble`.
#'
#' @param x a [`gen_tibble`], with population coded as 'population'
#' @param file the .vcf filename with a path, or NULL (the default) to use the
#' location of the backing files.
#' @param chunk_size the number of loci
#' processed at a time. Autmatically set if left to NULL
#' @returns the path of the .vcf file
#' @export
#'

gt_as_vcf <- function(x, file = NULL, chunk_size = NULL, overwrite = TRUE){

if (is.null(file)){
file <- bigstatsr::sub_bk(attr(x$genotypes,"bigsnp")$genotypes$backingfile,paste0(".vcf"))

Check warning on line 17 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L16-L17

Added lines #L16 - L17 were not covered by tests
}
if (file_ext(file)!="vcf"){
file <- paste0(file,".vcf")

Check warning on line 20 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L19-L20

Added lines #L19 - L20 were not covered by tests
}
if (!overwrite && file.exists(file)){
stop("file ", file," already exists; use 'overwrite=TRUE' to overwrite it")

Check warning on line 23 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L22-L23

Added lines #L22 - L23 were not covered by tests
}

# if chunk is null, get the best guess of an efficient approach
if (is.null(chunk_size)){
chunk_size <- bigstatsr::block_size(nrow(x))

Check warning on line 28 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L27-L28

Added lines #L27 - L28 were not covered by tests
}

# set up chunks
chunks_vec <- c(
rep(chunk_size, floor(count_loci(x) / chunk_size)),
count_loci(x) %% chunk_size
)
chunks_vec_index <- c(1,cumsum(chunks_vec))

Check warning on line 36 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L32-L36

Added lines #L32 - L36 were not covered by tests

# generate the header
vcf_header <- c("##fileformat=VCFv4.3",
paste0("##fileDate=",format(Sys.time(), "%Y%m%e")),
paste0("##source=tidypopgen_v",packageVersion("tidypopgen")))

Check warning on line 41 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L39-L41

Added lines #L39 - L41 were not covered by tests
# create copy of loci table
loci_tbl <- show_loci(x)

Check warning on line 43 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L43

Added line #L43 was not covered by tests
# reorder chromosomes levels in the order in which they appear
loci_tbl$chromosome <- forcats::fct_inorder(loci_tbl$chromosome)

Check warning on line 45 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L45

Added line #L45 was not covered by tests
# get max position for each chromosome
chromosome_summary <- loci_tbl %>% group_by(chromosome) %>% summarise(max = max(.data$position))
for (chrom_i in 1:nrow(chromosome_summary)){
vcf_header <- c(vcf_header,
paste0("##contig=<ID=", chromosome_summary$chromosome[chrom_i],
",length=",chromosome_summary$max[chrom_i]+1,
">"))

Check warning on line 52 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L47-L52

Added lines #L47 - L52 were not covered by tests
}
vcf_header <- c(vcf_header, '##INFO=<ID=PR,Number=0,Type=Flag,Description="Provisional reference allele, may not be based on real reference genome">')
vcf_header <- c(vcf_header, '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">')
vcf_header <- c(vcf_header, paste0("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t",
paste(x$id, collapse="\t")))

Check warning on line 57 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L54-L57

Added lines #L54 - L57 were not covered by tests
# write it to file
write.table(vcf_header, file = file, quote = FALSE, row.names = FALSE,
col.names = F)

Check warning on line 60 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L59-L60

Added lines #L59 - L60 were not covered by tests
# iterate over genotypes in chunks and append to the vcf body
for (chunk_i in seq_along(chunks_vec)) {
genotypes_matrix <- t(show_genotypes(x,
loci_indices =
chunks_vec_index[chunk_i]:chunks_vec_index[chunk_i+1]))
genotypes_matrix[genotypes_matrix==0] <- "0/0"
genotypes_matrix[genotypes_matrix==1] <- "0/1"
genotypes_matrix[genotypes_matrix==2] <- "1/1"
genotypes_matrix[is.na(genotypes_matrix)] <- "./."

Check warning on line 69 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L62-L69

Added lines #L62 - L69 were not covered by tests
# subset loci to this chunk
loci_sub <- show_loci(x)[chunks_vec_index[chunk_i]:chunks_vec_index[chunk_i+1],]

Check warning on line 71 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L71

Added line #L71 was not covered by tests
# add the other columns needed for the
loci_cols <- c("chromosome", "position", "name", "allele_ref", "allele_alt")
loci_sub <- loci_sub %>% select(any_of(loci_cols)) %>%
mutate(qual=".",filter=".", info="PR", format = "GT")
loci_sub[is.na(loci_sub)] <- "."
genotypes_matrix <- cbind(loci_sub, genotypes_matrix)

Check warning on line 77 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L73-L77

Added lines #L73 - L77 were not covered by tests
# append table to previous chunk
write.table(genotypes_matrix, file = file, quote = FALSE, append = TRUE,
col.names = FALSE, row.names = FALSE)

Check warning on line 80 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L79-L80

Added lines #L79 - L80 were not covered by tests
}
return(file)

Check warning on line 82 in R/gt_as_vcf.R

View check run for this annotation

Codecov / codecov/patch

R/gt_as_vcf.R#L82

Added line #L82 was not covered by tests

}

# TODO for tests
# read pop_a_vcf
# rewrite it as a vcf
# re-read it and check that we got back what we started with

23 changes: 23 additions & 0 deletions man/gt_as_vcf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit a820b63

Please sign in to comment.