Skip to content

Commit

Permalink
Merge pull request #44 from EvolEcolGroup/packedancestry
Browse files Browse the repository at this point in the history
Packedancestry
  • Loading branch information
dramanica authored Jun 12, 2024
2 parents 3ee0e55 + a21956f commit 89fc142
Show file tree
Hide file tree
Showing 10 changed files with 178 additions and 56 deletions.
4 changes: 1 addition & 3 deletions .github/workflows/pkgdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [main, master]
pull_request:
branches: [main, master]
branches: [main]
release:
types: [published]
workflow_dispatch:
Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: tidypopgen
Title: Tidy Population Genetics
Version: 0.0.0.9013
Version: 0.0.0.9014
Authors@R:
c(person("Evie", "Carter", role = c("aut")),
person("Andrea", "Manica", email = "[email protected]",
Expand Down
32 changes: 20 additions & 12 deletions R/gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@
#' and 'position','genetic_dist', 'allele_ref' and 'allele_alt'. This is only used
#' if `x` is a genotype matrix. Otherwise this information is extracted directly from
#' the files.
#' @param loci_per_chunk the number of loci processed at a time (currently only used
#' if `x` is a vcf file)
#' @param chunk_size the number of loci or individuals (depending on the format)
#' processed at a time (currently used
#' if `x` is a vcf or packedancestry file)
#' @param ... if `x` is the name of a vcf file, additional arguments
#' passed to [vcfR::read.vcfR()]. Otherwise, unused.
#' @param valid_alleles a vector of valid allele values; it defaults to 'A','T',
Expand Down Expand Up @@ -69,7 +70,7 @@ gen_tibble <-
#' @rdname gen_tibble
gen_tibble.character <-
function(x,
..., loci_per_chunk = 10000,
..., chunk_size = NULL,
valid_alleles = c("A", "T", "C", "G"),
missing_alleles = c("0","."),
backingfile = NULL,
Expand All @@ -88,7 +89,7 @@ gen_tibble.character <-
backingfile = backingfile,
quiet = quiet)
} else if ((tolower(file_ext(x))=="vcf") || (tolower(file_ext(x))=="gz")){
x_gt <- gen_tibble_vcf(x = x, ..., loci_per_chunk = loci_per_chunk,
x_gt <- gen_tibble_vcf(x = x, ..., chunk_size = chunk_size,
valid_alleles= valid_alleles,
missing_alleles= missing_alleles,
backingfile = backingfile, quiet = quiet)
Expand Down Expand Up @@ -155,15 +156,20 @@ gen_tibble_bed_rds <- function(x, ...,
indiv_meta$maternal_ID <- fam_info$maternal.ID
indiv_meta$maternal_ID[indiv_meta$maternal_ID==0]<-NA
}
if(!all(fam_info$sex==0)){
indiv_meta$sex <- dplyr::case_match(
fam_info$sex,
1 ~ "male",
2 ~ "female",
.default = NA,
.ptype = factor(levels = c("female", "male"))
)
# if sex is numeric
if (inherits(fam_info$sex,"numeric")){
if(!all(fam_info$sex==0)){
indiv_meta$sex <- dplyr::case_match(
fam_info$sex,
1 ~ "male",
2 ~ "female",
.default = NA,
.ptype = factor(levels = c("female", "male"))
)
}
}

if (inherits(fam_info$affection,"numeric")){
if(!all(fam_info$affection %in% c(0,-9))){
indiv_meta$phenotype <- dplyr::case_match(
fam_info$affection,
Expand All @@ -174,6 +180,8 @@ gen_tibble_bed_rds <- function(x, ...,
.ptype = factor(levels = c("control", "case"))
)
}
}

new_gen_tbl <- tibble::new_tibble(
indiv_meta,
class = "gen_tbl"
Expand Down
88 changes: 68 additions & 20 deletions R/gen_tibble_packedancestry.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@
gen_tibble_packedancestry <- function(x, ...,
valid_alleles = c("A", "T", "C", "G"),
missing_alleles = c("0","."),
chunk_size = NULL,
backingfile = NULL, quiet = FALSE) {
if (is.null(chunk_size)){
chunk_size <- 100
}

# Substitute .ped with .map
map_file <- sub("\\.geno$", ".snp", x)
if (!file.exists(map_file)){
Expand All @@ -20,32 +25,75 @@ gen_tibble_packedancestry <- function(x, ...,
backingfile <- bigstatsr::sub_bk(backingfile,"")
}

res <- admixtools::read_packedancestrymap(sub("\\.geno$", "", x),
# read the packed ancestry in chunks
# count the individuals
indiv_table <- utils::read.table(ind_file, header = FALSE)
names(indiv_table) <- c("id","sex","population")
no_individuals <- nrow(indiv_table)

# count the loci
loci_table <- utils::read.table(map_file, header = FALSE)
names(loci_table)<-c("name", "chromosome",'genetic_dist','position', 'allele_ref','allele_alt')
loci_table$allele_alt[loci_table$allele_alt == "X"] <- NA
no_variants <- nrow(loci_table)

# create a matrix to store the data
file_backed_matrix <- bigstatsr::FBM.code256(
nrow = no_individuals,
ncol = no_variants,
code = bigsnpr::CODE_012,
backingfile = backingfile
)


# set up chunks
chunks <- split(1:no_individuals, ceiling(seq_along(1:no_individuals)/chunk_size))

for (i in chunks) {
res <- admixtools::read_packedancestrymap(sub("\\.geno$", "", x),
transpose = TRUE,
...)
names(res$ind)<-c("id","sex","population")
#TODO check that allele_ref and allele_alt are not swapped
names(res$snp)<-c("name", "chromosome",'genetic_dist','position', 'allele_ref','allele_alt')
inds = indiv_table$id[i],
..., verbose = !quiet)
res$geno[is.na(res$geno)]<-3
#browser()
# now insert the genotypes in the FBM
file_backed_matrix[
i,

] <- res$geno
}

allele_alt <- res$snp$allele_ref
allele_ref <- res$snp$allele_alt
# save the fbm
file_backed_matrix$save()

loci <- cbind(res$snp[,c(1:4)], allele_ref, allele_alt)
# convert the info from the indiv and loci table
fam <- tibble(family.ID = indiv_table$population,
sample.ID = indiv_table$id,
paternal.ID = 0,
maternal.ID = 0,
sex = 0, # this needs fixing!
affection = 0,
ploidy = 2)

xs <- which(loci$allele_ref == "X")
map <- tibble(chromosome = loci_table$chromosome,
marker.ID = loci_table$name,
genetic.dist = loci_table$genetic_dist,
physical.pos = loci_table$position,
allele1 = loci_table$allele_ref,
allele2 = loci_table$allele_alt)

loci[xs, "allele_ref"] <- NA
bigsnp_obj <- structure(list(
genotypes = file_backed_matrix,
fam = fam,
map = map
), class = "bigSNP")

# using the gen_tibble.matrix method
new_gen_tbl <- gen_tibble(x = res$geno,
indiv_meta = res$ind,
#loci = res$snp,
loci = loci,
backingfile = backingfile,
quiet=quiet,
valid_alleles = valid_alleles,
missing_alleles = missing_alleles)
bigsnp_obj <- bigsnpr::snp_save(bigsnp_obj)

return(new_gen_tbl)
gen_tibble( bigsnp_obj$genotypes$rds,
valid_alleles = valid_alleles,
missing_alleles = missing_alleles,
backingfile = backingfile,
quiet = TRUE) # silence it as output will be generated by the parent function

}
50 changes: 50 additions & 0 deletions R/gen_tibble_packedancestry_old.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#A function to read geno packedancestrymap files
gen_tibble_packedancestry_old <- function(x, ...,
valid_alleles = c("A", "T", "C", "G"),
missing_alleles = c("0","."),
backingfile = NULL, quiet = FALSE) {
# Substitute .ped with .map
map_file <- sub("\\.geno$", ".snp", x)
if (!file.exists(map_file)){
stop("snp file ",map_file," does not exist")
}
ind_file <- sub("\\.geno$", ".ind", x)
if (!file.exists(ind_file)){
stop("ind file ",ind_file," does not exist")
}

if (is.null(backingfile)){
backingfile <- sub("\\.geno$", "", x)
}
if (file_ext(backingfile)=="bk"){
backingfile <- bigstatsr::sub_bk(backingfile,"")
}

res <- admixtools::read_packedancestrymap(sub("\\.geno$", "", x),
transpose = TRUE,
..., verbose = !quiet)
names(res$ind)<-c("id","sex","population")
#TODO check that allele_ref and allele_alt are not swapped
names(res$snp)<-c("name", "chromosome",'genetic_dist','position', 'allele_ref','allele_alt')

allele_alt <- res$snp$allele_ref
allele_ref <- res$snp$allele_alt

loci <- cbind(res$snp[,c(1:4)], allele_ref, allele_alt)

xs <- which(loci$allele_ref == "X")

loci[xs, "allele_ref"] <- NA

# using the gen_tibble.matrix method
new_gen_tbl <- gen_tibble(x = res$geno,
indiv_meta = res$ind,
loci = loci,
backingfile = backingfile,
quiet=quiet,
valid_alleles = valid_alleles,
missing_alleles = missing_alleles)

return(new_gen_tbl)

}
17 changes: 13 additions & 4 deletions R/gen_tibble_vcf.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
# read in a vcf
gen_tibble_vcf <- function(x, ...,
loci_per_chunk = 100000,
chunk_size = NULL,
valid_alleles = c("A", "T", "C", "G"),
missing_alleles = c("0","."),
backingfile = NULL, quiet = FALSE) {
rds_path <- vcf_to_fbm(x,backingfile = backingfile,loci_per_chunk = loci_per_chunk, quiet = quiet)
gen_tibble(rds_path, valid_alleles = valid_alleles, missing_alleles = missing_alleles,
backingfile = backingfile, quiet = quiet)
if (is.null(chunk_size)){
chunk_size <- 10000
}
rds_path <- vcf_to_fbm(x,
backingfile = backingfile,
chunk_size = chunk_size,
quiet = quiet)
gen_tibble(rds_path,
valid_alleles = valid_alleles,
missing_alleles = missing_alleles,
backingfile = backingfile,
quiet = quiet)
}
9 changes: 4 additions & 5 deletions R/vcf_to_fbm.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,16 @@
#'
#' Convert a vcf file to a Filebacked Big Matrix (FBM) object.
#' This should work even for large vcf files that would not fit in memory.
#' TODO: this function is not yet complete.
#'
#' @param vcf_path the path to the vcf
#' @param loci_per_chunk the chunk size to use on the vcf when loading the file
#' @param chunk_size the chunk size to use on the vcf when loading the file
#' @param backingfile the name of the file to use as the backing file
#' @return path to the resulting rds file as class bigSNP.
#' @keywords internal

vcf_to_fbm <- function(
vcf_path,
loci_per_chunk = 100000,
chunk_size = 100000,
backingfile = NULL,
quiet=FALSE) {

Expand All @@ -32,8 +31,8 @@ vcf_to_fbm <- function(
no_individuals <- count_vcf_individuals(vcf_path)
# set up chunks
chunks_vec <- c(
rep(loci_per_chunk, floor(no_variants / loci_per_chunk)),
no_variants %% loci_per_chunk
rep(chunk_size, floor(no_variants / chunk_size)),
no_variants %% chunk_size
)
chunks_vec_index <- c(1, chunks_vec)

Expand Down
7 changes: 4 additions & 3 deletions man/gen_tibble.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions man/vcf_to_fbm.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 15 additions & 5 deletions tests/testthat/test_gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -160,14 +160,19 @@ test_that("gen_tibble identifies wrong loci table columns",{


test_that("gen_tibble from files",{
########################
# PLINK BED files
########################
bed_path <- system.file("extdata/pop_a.bed", package = "tidypopgen")
pop_a_gt <- gen_tibble(bed_path, quiet=TRUE, backingfile = tempfile())
# now read the dosages created by plink when saving in raw format
raw_file_pop_a <- read.table(system.file("extdata/pop_a.raw", package = "tidypopgen"), header= TRUE)
mat <- as.matrix(raw_file_pop_a[,7:ncol(raw_file_pop_a)])
mat <- unname(mat)
expect_true(all.equal(mat,show_genotypes(pop_a_gt)))
# now read in the ped file
########################
# PLINK PED files
########################
ped_path <- system.file("extdata/pop_a.ped", package = "tidypopgen")
pop_a_ped_gt <- gen_tibble(ped_path, quiet=TRUE,backingfile = tempfile())
# because ref and alt are defined based on which occurs first in a ped, some alleles will be swapped
Expand All @@ -177,15 +182,19 @@ test_that("gen_tibble from files",{
expect_true(all(show_loci(pop_a_gt)$allele_alt[not_equal] == show_loci(pop_a_ped_gt)$allele_ref[not_equal]))
# check that the mismatches are all in the homozygotes
expect_true(all(abs(show_genotypes(pop_a_gt)[, not_equal]-show_genotypes(pop_a_ped_gt)[, not_equal]) %in% c(0,2)))
# now read in vcf
########################
# PLINK VCF files
########################
vcf_path <- system.file("extdata/pop_a.vcf", package = "tidypopgen")
pop_a_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile())
expect_true(all.equal(show_genotypes(pop_a_gt),show_genotypes(pop_a_vcf_gt)))
# reload it in chunks
pop_a_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), loci_per_chunk=2)
pop_a_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(),
chunk_size = 2)
expect_true(all.equal(show_genotypes(pop_a_vcf_gt2),show_genotypes(pop_a_vcf_gt)))
expect_true(all.equal(show_loci(pop_a_vcf_gt2),show_loci(pop_a_vcf_gt)))
# we should add similar tests for pop b, which has missing data

# @TODO we should add similar tests for pop b, which has missing data

})

Expand All @@ -212,7 +221,8 @@ test_that("gen_tibble from files with missingness",{
pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile())
expect_true(all.equal(show_genotypes(pop_b_gt),show_genotypes(pop_b_vcf_gt)))
# reload it in chunks
pop_b_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), loci_per_chunk=2)
pop_b_vcf_gt2 <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(),
chunk_size = 2)
expect_true(all.equal(show_genotypes(pop_b_vcf_gt2),show_genotypes(pop_b_vcf_gt)))
expect_true(all.equal(show_loci(pop_b_vcf_gt2),show_loci(pop_b_vcf_gt)))

Expand Down

0 comments on commit 89fc142

Please sign in to comment.