Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ploidy #36

Merged
merged 36 commits into from
May 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
c892b37
store ploidy info (all fixed to diploid)
dramanica Apr 20, 2024
8eedb6c
example dataset
dramanica Apr 20, 2024
64b7da6
correctly track ploidy from matrix
dramanica Apr 21, 2024
274821c
better handling of ploidy
dramanica Apr 21, 2024
330eb38
speed up alt_freq
dramanica Apr 21, 2024
88a7b8b
fix subsetting
dramanica Apr 21, 2024
39d560b
tidy docs
dramanica Apr 21, 2024
1a5b668
compute frequency in polyploids
dramanica Apr 22, 2024
22ef591
add loci missingness for polyploids
dramanica Apr 22, 2024
eed8b6a
make indiv_missingness faster and work with polyploids
dramanica Apr 22, 2024
7aeaf0b
Merge branch 'ploidy' of github.com:EvolEcolGroup/tidypopgen into ploidy
dramanica Apr 23, 2024
312411d
count_vcf_variants
dramanica Mar 26, 2024
5da977b
Resolve dapc conflict
Euphrasiologist May 3, 2024
8349d5d
Resolve gen_tibble conflict
Euphrasiologist May 3, 2024
ea88548
Implement loci_maf()
dramanica Mar 26, 2024
635cfbd
test gen_tibble error
eviecarter33 Mar 22, 2024
416170f
Add count_vcf_individuals
Euphrasiologist Mar 27, 2024
0bbd860
Format gt_vcf_to_bed.R
Euphrasiologist Mar 28, 2024
bad7d59
First attempy at vcf to fbm function. Note, not added to tests, or in…
Euphrasiologist Mar 28, 2024
362ee1c
Merge branch 'main' into ploidy
dramanica May 3, 2024
18c921d
resolve merge issues
dramanica May 3, 2024
701d41b
small clean up
dramanica May 3, 2024
724298c
Merge branch 'main' into ploidy
dramanica May 10, 2024
e3f1e12
clean up to have a ploidy example
dramanica May 10, 2024
6ac0ae3
keep some functions as internal
dramanica May 10, 2024
3d5b200
adapt vcf for ploidy
dramanica May 10, 2024
9f98ede
adapt vcf_to_fbm to deal with ploidy
dramanica May 10, 2024
4f5e822
remove examples for internal functions
dramanica May 10, 2024
9fd3051
initial implementation of vcf reading for ploidy
dramanica May 11, 2024
5d1e8e0
vcf fix
dramanica May 11, 2024
f29a06b
add test data vcfs
eviecarter33 May 13, 2024
23d5ebe
Merge pull request #35 from EvolEcolGroup/add_vcfs
dramanica May 13, 2024
df59bde
test vcf input
dramanica May 13, 2024
36e49e7
improve warning message
dramanica May 13, 2024
a7e28ec
clean up test files
dramanica May 13, 2024
9eb84cd
improve vcf tests
dramanica May 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ S3method(indiv_het_obs,vctrs_bigSNP)
S3method(indiv_missingness,grouped_df)
S3method(indiv_missingness,tbl_df)
S3method(indiv_missingness,vctrs_bigSNP)
S3method(indiv_ploidy,grouped_df)
S3method(indiv_ploidy,tbl_df)
S3method(indiv_ploidy,vctrs_bigSNP)
S3method(loci_alt_freq,grouped_df)
S3method(loci_alt_freq,tbl_df)
S3method(loci_alt_freq,vctrs_bigSNP)
Expand Down Expand Up @@ -59,6 +62,8 @@ S3method(show_genotypes,tbl_df)
S3method(show_genotypes,vctrs_bigSNP)
S3method(show_loci,tbl_df)
S3method(show_loci,vctrs_bigSNP)
S3method(show_ploidy,tbl_df)
S3method(show_ploidy,vctrs_bigSNP)
S3method(summary,rbind_report)
S3method(summary,vctrs_bigSNP)
S3method(tbl_sum,gen_tbl)
Expand Down Expand Up @@ -94,6 +99,7 @@ export(gt_set_imputed)
export(gt_uses_imputed)
export(indiv_het_obs)
export(indiv_missingness)
export(indiv_ploidy)
export(loci_alt_freq)
export(loci_chromosomes)
export(loci_hwe)
Expand All @@ -116,6 +122,7 @@ export(select_loci)
export(select_loci_if)
export(show_genotypes)
export(show_loci)
export(show_ploidy)
export(snp_allele_sharing)
export(snp_ibs)
export(snp_king)
Expand Down
130 changes: 130 additions & 0 deletions R/count_vcf_variants.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#' Counts the number of variants in a vcf file
#'
#' Count the number of VCF variants by first parsing the header, and then
#' rapidly counting the number of platform-independent
#' newlines (CR, LF, and CR+LF), including a last line with neither.
#'
#' @author author Henrik Bengtsson for the original `countLines` in `R.utils`;
#' Andrea Manica for the modified version focussed on vcf
#' @param file name and path of vcf file (it can be compressed)
#' @param chunk_size The number of bytes read in each chunk.
#' @returns the number of variants (an integer)
#' @keywords internal

count_vcf_variants <- function(file, chunk_size = 50e6) {
if (!is.character(file)) {
stop("file should be a character giving the path of the vcf")
}
if (!file.exists(file)) {
stop("file is not a valid path to a vcf file")
}
con <- gzfile(file, open = "rb")
on.exit(close(con))

# skip the header (all lines start with #)
while (TRUE) {
a_line <- readLines(con, n = 1)
if (!substr(a_line, 1, 1) == "#") {
break
}
}

LF <- as.raw(0x0a)
CR <- as.raw(0x0d)
SPC <- as.raw(32L)

isLastCR <- isLastLF <- FALSE
isEmpty <- TRUE
nbrOfLines <- 0L
while (TRUE) {
bfr <- readBin(con = con, what = raw(), n = chunk_size)
if (isLastCR) {
# Don't count LF following a CR in previous chunk.
if (bfr[1L] == LF) {
bfr[1L] <- SPC
}
}

n <- length(bfr)
if (n == 0L) {
break
}

isEmpty <- FALSE

# Replace all CRLF:s to become LF:s
idxsCR <- which(bfr == CR)
nCR <- length(idxsCR)
if (nCR > 0L) {
idxsCRLF <- idxsCR[(bfr[idxsCR + 1L] == LF)]
if (length(idxsCRLF) > 0L) {
bfr <- bfr[-idxsCRLF]
n <- length(bfr)
idxsCRLF <- NULL # Not needed anymore
nCR <- length(which(bfr == CR))
}
}

# Count all CR:s and LF:s
nLF <- length(which(bfr == LF))
nbrOfLines <- nbrOfLines + (nCR + nLF)

if (n == 0L) {
isLastCR <- isLastLF <- FALSE
} else {
# If last symbol is CR it might be followed by a LF in
# the next chunk. If so, don't count that next LF.
bfrN <- bfr[n]
isLastCR <- (bfrN == CR)
isLastLF <- (bfrN == LF)
}
} # while()

# Count any last line without newline too
if (!isEmpty) {
if (!isLastLF) nbrOfLines <- nbrOfLines + 1L
attr(nbrOfLines, "lastLineHasNewline") <- isLastLF
}

# note the +1, since we read one variant whilst checking for the header
nbrOfLines + 1
}


#' Counts the number of individuals in a vcf file
#'
#' Count the number of VCF individuals by first parsing the header, and then
#' getting the first line, then separating on tabs.
#'
#' @author author Henrik Bengtsson for the original `countLines` in `R.utils`;
#' Andrea Manica for the modified version focussed on vcf; Max Carter-Brown modified
#' the file above.
#' @param file name and path of vcf file (it can be compressed)
#' @returns the number of individuals (an integer)
#' @keywords internal

count_vcf_individuals <- function(file) {
if (!is.character(file)) {
stop("file should be a character giving the path of the vcf")
}
if (!file.exists(file)) {
stop("file is not a valid path to a vcf file")
}
con <- gzfile(file, open = "rb")
on.exit(close(con))

# skip the header (all lines start with #)
while (TRUE) {
a_line <- readLines(con, n = 1)
if (!substr(a_line, 1, 1) == "#") {
break
}
}

# read the first variant line
a_line <- readLines(con, n = 1)
# split the line by tab
a_line <- strsplit(a_line, "\t")[[1]]
# count the number of individuals
sum(a_line[10:length(a_line)] != ".")
}
Loading