From 84e0c6ca9ead588fb1fbbe11cdfafccdbf156447 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Wed, 12 Jun 2024 07:00:34 +0100 Subject: [PATCH 1/5] read packedancestry in chunks --- R/gen_tibble.R | 23 ++++++--- R/gen_tibble_packedancestry.R | 84 +++++++++++++++++++++++-------- R/gen_tibble_packedancestry_old.R | 50 ++++++++++++++++++ tests/testthat/test_gen_tibble.R | 14 ++++-- 4 files changed, 140 insertions(+), 31 deletions(-) create mode 100644 R/gen_tibble_packedancestry_old.R diff --git a/R/gen_tibble.R b/R/gen_tibble.R index 7bc81390..e6790977 100644 --- a/R/gen_tibble.R +++ b/R/gen_tibble.R @@ -155,15 +155,20 @@ gen_tibble_bed_rds <- function(x, ..., indiv_meta$maternal_ID <- fam_info$maternal.ID indiv_meta$maternal_ID[indiv_meta$maternal_ID==0]<-NA } - if(!all(fam_info$sex==0)){ - indiv_meta$sex <- dplyr::case_match( - fam_info$sex, - 1 ~ "male", - 2 ~ "female", - .default = NA, - .ptype = factor(levels = c("female", "male")) - ) + # if sex is numeric + if (inherits(fam_info$sex,"numeric")){ + if(!all(fam_info$sex==0)){ + indiv_meta$sex <- dplyr::case_match( + fam_info$sex, + 1 ~ "male", + 2 ~ "female", + .default = NA, + .ptype = factor(levels = c("female", "male")) + ) + } } + + if (inherits(fam_info$affection,"numeric")){ if(!all(fam_info$affection %in% c(0,-9))){ indiv_meta$phenotype <- dplyr::case_match( fam_info$affection, @@ -174,6 +179,8 @@ gen_tibble_bed_rds <- function(x, ..., .ptype = factor(levels = c("control", "case")) ) } + } + new_gen_tbl <- tibble::new_tibble( indiv_meta, class = "gen_tbl" diff --git a/R/gen_tibble_packedancestry.R b/R/gen_tibble_packedancestry.R index 3af95a2a..ead68beb 100644 --- a/R/gen_tibble_packedancestry.R +++ b/R/gen_tibble_packedancestry.R @@ -2,6 +2,7 @@ gen_tibble_packedancestry <- function(x, ..., valid_alleles = c("A", "T", "C", "G"), missing_alleles = c("0","."), + chunk_size = 100, backingfile = NULL, quiet = FALSE) { # Substitute .ped with .map map_file <- sub("\\.geno$", ".snp", x) @@ -20,32 +21,75 @@ gen_tibble_packedancestry <- function(x, ..., backingfile <- bigstatsr::sub_bk(backingfile,"") } - res <- admixtools::read_packedancestrymap(sub("\\.geno$", "", x), + # read the packed ancestry in chunks + # count the individuals + indiv_table <- read.table(ind_file, header = FALSE) + names(indiv_table) <- c("id","sex","population") + no_individuals <- nrow(indiv_table) + + # count the loci + loci_table <- read.table(map_file, header = FALSE) + names(loci_table)<-c("name", "chromosome",'genetic_dist','position', 'allele_ref','allele_alt') + loci_table$allele_alt[loci_table$allele_alt == "X"] <- NA + no_variants <- nrow(loci_table) + + # create a matrix to store the data + file_backed_matrix <- bigstatsr::FBM.code256( + nrow = no_individuals, + ncol = no_variants, + code = bigsnpr::CODE_012, + backingfile = backingfile + ) + + + # set up chunks + chunks <- split(1:no_individuals, ceiling(seq_along(1:no_individuals)/chunk_size)) + + for (i in chunks) { + res <- admixtools::read_packedancestrymap(sub("\\.geno$", "", x), transpose = TRUE, - ...) - names(res$ind)<-c("id","sex","population") - #TODO check that allele_ref and allele_alt are not swapped - names(res$snp)<-c("name", "chromosome",'genetic_dist','position', 'allele_ref','allele_alt') + inds = indiv_table$id[i], + ..., verbose = !quiet) + res$geno[is.na(res$geno)]<-3 + #browser() + # now insert the genotypes in the FBM + file_backed_matrix[ + i, + + ] <- res$geno + } - allele_alt <- res$snp$allele_ref - allele_ref <- res$snp$allele_alt + # save the fbm + file_backed_matrix$save() - loci <- cbind(res$snp[,c(1:4)], allele_ref, allele_alt) + # convert the info from the indiv and loci table + fam <- tibble(family.ID = indiv_table$population, + sample.ID = indiv_table$id, + paternal.ID = 0, + maternal.ID = 0, + sex = 0, # this needs fixing! + affection = 0, + ploidy = 2) - xs <- which(loci$allele_ref == "X") + map <- tibble(chromosome = loci_table$chromosome, + marker.ID = loci_table$name, + genetic.dist = loci_table$genetic_dist, + physical.pos = loci_table$position, + allele1 = loci_table$allele_ref, + allele2 = loci_table$allele_alt) - loci[xs, "allele_ref"] <- NA + bigsnp_obj <- structure(list( + genotypes = file_backed_matrix, + fam = fam, + map = map + ), class = "bigSNP") - # using the gen_tibble.matrix method - new_gen_tbl <- gen_tibble(x = res$geno, - indiv_meta = res$ind, - #loci = res$snp, - loci = loci, - backingfile = backingfile, - quiet=quiet, - valid_alleles = valid_alleles, - missing_alleles = missing_alleles) + bigsnp_obj <- bigsnpr::snp_save(bigsnp_obj) - return(new_gen_tbl) + gen_tibble( bigsnp_obj$genotypes$rds, + valid_alleles = valid_alleles, + missing_alleles = missing_alleles, + backingfile = backingfile, + quiet = quiet) } diff --git a/R/gen_tibble_packedancestry_old.R b/R/gen_tibble_packedancestry_old.R new file mode 100644 index 00000000..015e4c8c --- /dev/null +++ b/R/gen_tibble_packedancestry_old.R @@ -0,0 +1,50 @@ +#A function to read geno packedancestrymap files +gen_tibble_packedancestry_old <- function(x, ..., + valid_alleles = c("A", "T", "C", "G"), + missing_alleles = c("0","."), + backingfile = NULL, quiet = FALSE) { + # Substitute .ped with .map + map_file <- sub("\\.geno$", ".snp", x) + if (!file.exists(map_file)){ + stop("snp file ",map_file," does not exist") + } + ind_file <- sub("\\.geno$", ".ind", x) + if (!file.exists(ind_file)){ + stop("ind file ",ind_file," does not exist") + } + + if (is.null(backingfile)){ + backingfile <- sub("\\.geno$", "", x) + } + if (file_ext(backingfile)=="bk"){ + backingfile <- bigstatsr::sub_bk(backingfile,"") + } + + res <- admixtools::read_packedancestrymap(sub("\\.geno$", "", x), + transpose = TRUE, + ..., verbose = !quiet) + names(res$ind)<-c("id","sex","population") + #TODO check that allele_ref and allele_alt are not swapped + names(res$snp)<-c("name", "chromosome",'genetic_dist','position', 'allele_ref','allele_alt') + + allele_alt <- res$snp$allele_ref + allele_ref <- res$snp$allele_alt + + loci <- cbind(res$snp[,c(1:4)], allele_ref, allele_alt) + + xs <- which(loci$allele_ref == "X") + + loci[xs, "allele_ref"] <- NA + + # using the gen_tibble.matrix method + new_gen_tbl <- gen_tibble(x = res$geno, + indiv_meta = res$ind, + loci = loci, + backingfile = backingfile, + quiet=quiet, + valid_alleles = valid_alleles, + missing_alleles = missing_alleles) + + return(new_gen_tbl) + +} diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R index 3d92fe2d..127b0a84 100644 --- a/tests/testthat/test_gen_tibble.R +++ b/tests/testthat/test_gen_tibble.R @@ -160,6 +160,9 @@ test_that("gen_tibble identifies wrong loci table columns",{ test_that("gen_tibble from files",{ + ######################## + # PLINK BED files + ######################## bed_path <- system.file("extdata/pop_a.bed", package = "tidypopgen") pop_a_gt <- gen_tibble(bed_path, quiet=TRUE, backingfile = tempfile()) # now read the dosages created by plink when saving in raw format @@ -167,7 +170,9 @@ test_that("gen_tibble from files",{ mat <- as.matrix(raw_file_pop_a[,7:ncol(raw_file_pop_a)]) mat <- unname(mat) expect_true(all.equal(mat,show_genotypes(pop_a_gt))) - # now read in the ped file + ######################## + # PLINK PED files + ######################## ped_path <- system.file("extdata/pop_a.ped", package = "tidypopgen") pop_a_ped_gt <- gen_tibble(ped_path, quiet=TRUE,backingfile = tempfile()) # because ref and alt are defined based on which occurs first in a ped, some alleles will be swapped @@ -177,7 +182,9 @@ test_that("gen_tibble from files",{ expect_true(all(show_loci(pop_a_gt)$allele_alt[not_equal] == show_loci(pop_a_ped_gt)$allele_ref[not_equal])) # check that the mismatches are all in the homozygotes expect_true(all(abs(show_genotypes(pop_a_gt)[, not_equal]-show_genotypes(pop_a_ped_gt)[, not_equal]) %in% c(0,2))) - # now read in vcf + ######################## + # PLINK VCF files + ######################## vcf_path <- system.file("extdata/pop_a.vcf", package = "tidypopgen") pop_a_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile()) expect_true(all.equal(show_genotypes(pop_a_gt),show_genotypes(pop_a_vcf_gt))) @@ -185,7 +192,8 @@ test_that("gen_tibble from files",{ pop_a_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), loci_per_chunk=2) expect_true(all.equal(show_genotypes(pop_a_vcf_gt2),show_genotypes(pop_a_vcf_gt))) expect_true(all.equal(show_loci(pop_a_vcf_gt2),show_loci(pop_a_vcf_gt))) - # we should add similar tests for pop b, which has missing data + + # @TODO we should add similar tests for pop b, which has missing data }) From 25b4b97395ef172515ff0fd7499f32c240ec7ee8 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Wed, 12 Jun 2024 08:45:14 +0100 Subject: [PATCH 2/5] exposed chunk size --- R/gen_tibble.R | 9 +++++---- R/gen_tibble_packedancestry.R | 12 ++++++++---- R/gen_tibble_vcf.R | 17 +++++++++++++---- R/vcf_to_fbm.R | 9 ++++----- man/gen_tibble.Rd | 7 ++++--- man/vcf_to_fbm.Rd | 5 ++--- tests/testthat/test_gen_tibble.R | 6 ++++-- 7 files changed, 40 insertions(+), 25 deletions(-) diff --git a/R/gen_tibble.R b/R/gen_tibble.R index e6790977..9d53ffe4 100644 --- a/R/gen_tibble.R +++ b/R/gen_tibble.R @@ -28,8 +28,9 @@ #' and 'position','genetic_dist', 'allele_ref' and 'allele_alt'. This is only used #' if `x` is a genotype matrix. Otherwise this information is extracted directly from #' the files. -#' @param loci_per_chunk the number of loci processed at a time (currently only used -#' if `x` is a vcf file) +#' @param chunk_size the number of loci or individuals (depending on the format) +#' processed at a time (currently used +#' if `x` is a vcf or packedancestry file) #' @param ... if `x` is the name of a vcf file, additional arguments #' passed to [vcfR::read.vcfR()]. Otherwise, unused. #' @param valid_alleles a vector of valid allele values; it defaults to 'A','T', @@ -69,7 +70,7 @@ gen_tibble <- #' @rdname gen_tibble gen_tibble.character <- function(x, - ..., loci_per_chunk = 10000, + ..., chunk_size = NULL, valid_alleles = c("A", "T", "C", "G"), missing_alleles = c("0","."), backingfile = NULL, @@ -88,7 +89,7 @@ gen_tibble.character <- backingfile = backingfile, quiet = quiet) } else if ((tolower(file_ext(x))=="vcf") || (tolower(file_ext(x))=="gz")){ - x_gt <- gen_tibble_vcf(x = x, ..., loci_per_chunk = loci_per_chunk, + x_gt <- gen_tibble_vcf(x = x, ..., chunk_size = chunk_size, valid_alleles= valid_alleles, missing_alleles= missing_alleles, backingfile = backingfile, quiet = quiet) diff --git a/R/gen_tibble_packedancestry.R b/R/gen_tibble_packedancestry.R index ead68beb..6748a3dc 100644 --- a/R/gen_tibble_packedancestry.R +++ b/R/gen_tibble_packedancestry.R @@ -2,8 +2,12 @@ gen_tibble_packedancestry <- function(x, ..., valid_alleles = c("A", "T", "C", "G"), missing_alleles = c("0","."), - chunk_size = 100, + chunk_size = NULL, backingfile = NULL, quiet = FALSE) { + if (is.null(chunk_size)){ + chunk_size <- 500 + } + # Substitute .ped with .map map_file <- sub("\\.geno$", ".snp", x) if (!file.exists(map_file)){ @@ -23,12 +27,12 @@ gen_tibble_packedancestry <- function(x, ..., # read the packed ancestry in chunks # count the individuals - indiv_table <- read.table(ind_file, header = FALSE) + indiv_table <- utils::read.table(ind_file, header = FALSE) names(indiv_table) <- c("id","sex","population") no_individuals <- nrow(indiv_table) # count the loci - loci_table <- read.table(map_file, header = FALSE) + loci_table <- utils::read.table(map_file, header = FALSE) names(loci_table)<-c("name", "chromosome",'genetic_dist','position', 'allele_ref','allele_alt') loci_table$allele_alt[loci_table$allele_alt == "X"] <- NA no_variants <- nrow(loci_table) @@ -90,6 +94,6 @@ gen_tibble_packedancestry <- function(x, ..., valid_alleles = valid_alleles, missing_alleles = missing_alleles, backingfile = backingfile, - quiet = quiet) + quiet = TRUE) # silence it as output will be generated by the parent function } diff --git a/R/gen_tibble_vcf.R b/R/gen_tibble_vcf.R index c8b52551..1b9de181 100644 --- a/R/gen_tibble_vcf.R +++ b/R/gen_tibble_vcf.R @@ -1,10 +1,19 @@ # read in a vcf gen_tibble_vcf <- function(x, ..., - loci_per_chunk = 100000, + chunk_size = NULL, valid_alleles = c("A", "T", "C", "G"), missing_alleles = c("0","."), backingfile = NULL, quiet = FALSE) { - rds_path <- vcf_to_fbm(x,backingfile = backingfile,loci_per_chunk = loci_per_chunk, quiet = quiet) - gen_tibble(rds_path, valid_alleles = valid_alleles, missing_alleles = missing_alleles, - backingfile = backingfile, quiet = quiet) + if (is.null(chunk_size)){ + chunk_size <- 10000 + } + rds_path <- vcf_to_fbm(x, + backingfile = backingfile, + chunk_size = chunk_size, + quiet = quiet) + gen_tibble(rds_path, + valid_alleles = valid_alleles, + missing_alleles = missing_alleles, + backingfile = backingfile, + quiet = quiet) } diff --git a/R/vcf_to_fbm.R b/R/vcf_to_fbm.R index 1877a8e9..e7383df5 100644 --- a/R/vcf_to_fbm.R +++ b/R/vcf_to_fbm.R @@ -2,17 +2,16 @@ #' #' Convert a vcf file to a Filebacked Big Matrix (FBM) object. #' This should work even for large vcf files that would not fit in memory. -#' TODO: this function is not yet complete. #' #' @param vcf_path the path to the vcf -#' @param loci_per_chunk the chunk size to use on the vcf when loading the file +#' @param chunk_size the chunk size to use on the vcf when loading the file #' @param backingfile the name of the file to use as the backing file #' @return path to the resulting rds file as class bigSNP. #' @keywords internal vcf_to_fbm <- function( vcf_path, - loci_per_chunk = 100000, + chunk_size = 100000, backingfile = NULL, quiet=FALSE) { @@ -32,8 +31,8 @@ vcf_to_fbm <- function( no_individuals <- count_vcf_individuals(vcf_path) # set up chunks chunks_vec <- c( - rep(loci_per_chunk, floor(no_variants / loci_per_chunk)), - no_variants %% loci_per_chunk + rep(chunk_size, floor(no_variants / chunk_size)), + no_variants %% chunk_size ) chunks_vec_index <- c(1, chunks_vec) diff --git a/man/gen_tibble.Rd b/man/gen_tibble.Rd index b1d08beb..7707c3a7 100644 --- a/man/gen_tibble.Rd +++ b/man/gen_tibble.Rd @@ -18,7 +18,7 @@ gen_tibble( \method{gen_tibble}{character}( x, ..., - loci_per_chunk = 10000, + chunk_size = NULL, valid_alleles = c("A", "T", "C", "G"), missing_alleles = c("0", "."), backingfile = NULL, @@ -77,8 +77,9 @@ the session!)} \item{quiet}{provide information on the files used to store the data} -\item{loci_per_chunk}{the number of loci processed at a time (currently only used -if \code{x} is a vcf file)} +\item{chunk_size}{the number of loci or individuals (depending on the format) +processed at a time (currently used +if \code{x} is a vcf or packedancestry file)} \item{indiv_meta}{a list, data.frame or tibble with compulsory columns 'id' and 'population', plus any additional metadata of interest. This is only used diff --git a/man/vcf_to_fbm.Rd b/man/vcf_to_fbm.Rd index d9550174..bfa7b65c 100644 --- a/man/vcf_to_fbm.Rd +++ b/man/vcf_to_fbm.Rd @@ -4,12 +4,12 @@ \alias{vcf_to_fbm} \title{Convert vcf to FBM.} \usage{ -vcf_to_fbm(vcf_path, loci_per_chunk = 1e+05, backingfile = NULL, quiet = FALSE) +vcf_to_fbm(vcf_path, chunk_size = 1e+05, backingfile = NULL, quiet = FALSE) } \arguments{ \item{vcf_path}{the path to the vcf} -\item{loci_per_chunk}{the chunk size to use on the vcf when loading the file} +\item{chunk_size}{the chunk size to use on the vcf when loading the file} \item{backingfile}{the name of the file to use as the backing file} } @@ -19,6 +19,5 @@ path to the resulting rds file as class bigSNP. \description{ Convert a vcf file to a Filebacked Big Matrix (FBM) object. This should work even for large vcf files that would not fit in memory. -TODO: this function is not yet complete. } \keyword{internal} diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R index 127b0a84..4be947f0 100644 --- a/tests/testthat/test_gen_tibble.R +++ b/tests/testthat/test_gen_tibble.R @@ -189,7 +189,8 @@ test_that("gen_tibble from files",{ pop_a_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile()) expect_true(all.equal(show_genotypes(pop_a_gt),show_genotypes(pop_a_vcf_gt))) # reload it in chunks - pop_a_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), loci_per_chunk=2) + pop_a_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), + chunk_size = 2) expect_true(all.equal(show_genotypes(pop_a_vcf_gt2),show_genotypes(pop_a_vcf_gt))) expect_true(all.equal(show_loci(pop_a_vcf_gt2),show_loci(pop_a_vcf_gt))) @@ -220,7 +221,8 @@ test_that("gen_tibble from files with missingness",{ pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile()) expect_true(all.equal(show_genotypes(pop_b_gt),show_genotypes(pop_b_vcf_gt))) # reload it in chunks - pop_b_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), loci_per_chunk=2) + pop_b_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), + chunk_size = 2) expect_true(all.equal(show_genotypes(pop_b_vcf_gt2),show_genotypes(pop_b_vcf_gt))) expect_true(all.equal(show_loci(pop_b_vcf_gt2),show_loci(pop_b_vcf_gt))) From 34c15a3b70561498355e0fc0682282aba2c90e45 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Wed, 12 Jun 2024 11:27:06 +0100 Subject: [PATCH 3/5] default to 100 samples in packedancestry --- R/gen_tibble_packedancestry.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/gen_tibble_packedancestry.R b/R/gen_tibble_packedancestry.R index 6748a3dc..4e6801d6 100644 --- a/R/gen_tibble_packedancestry.R +++ b/R/gen_tibble_packedancestry.R @@ -5,7 +5,7 @@ gen_tibble_packedancestry <- function(x, ..., chunk_size = NULL, backingfile = NULL, quiet = FALSE) { if (is.null(chunk_size)){ - chunk_size <- 500 + chunk_size <- 100 } # Substitute .ped with .map From 9b9a2df027abf33d27e57d8d246e84eb0cd3915e Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Wed, 12 Jun 2024 11:30:54 +0100 Subject: [PATCH 4/5] Don't build website on pull request --- .github/workflows/pkgdown.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index c9f0165d..4755ea9e 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -2,9 +2,7 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master] - pull_request: - branches: [main, master] + branches: [main] release: types: [published] workflow_dispatch: From a21956fb07368ebb7711d55ec4fc9108d19b7f2c Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Wed, 12 Jun 2024 11:31:16 +0100 Subject: [PATCH 5/5] bump version --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7fa8568e..2d912cfe 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: tidypopgen Title: Tidy Population Genetics -Version: 0.0.0.9013 +Version: 0.0.0.9014 Authors@R: c(person("Evie", "Carter", role = c("aut")), person("Andrea", "Manica", email = "am315@cam.ac.uk",