Skip to content

Commit

Permalink
Merge pull request #34 from EvolEcolGroup/write_ped
Browse files Browse the repository at this point in the history
Write ped
  • Loading branch information
dramanica authored May 10, 2024
2 parents 3b4f9e6 + 5f73bd9 commit 9cadb13
Show file tree
Hide file tree
Showing 14 changed files with 238 additions and 50 deletions.
31 changes: 31 additions & 0 deletions R/gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,42 @@ gen_tibble_bed_rds <- function(x, ...,

indiv_meta <- list(id = bigsnp_obj$fam$sample.ID,
population = bigsnp_obj$fam$family.ID)
# TODO check if other elements are informative; if they are, bring them over


indiv_meta$genotypes <- new_vctrs_bigsnp(bigsnp_obj,
bigsnp_file = bigsnp_path,
indiv_id = bigsnp_obj$fam$sample.ID)

# transfer some of the fam info to the metadata table if it is not missing (0 is the default missing value)
fam_info <- .gt_get_bigsnp(indiv_meta)$fam
if(!all(fam_info$paternal.id==0)){
indiv_meta$paternal_ID <- fam_info$paternal.id
indiv_meta$paternal_ID[indiv_meta$paternal_ID==0]<-NA
}
if(!all(fam_info$maternal.id==0)){
indiv_meta$maternal_ID <- fam_info$maternal.id
indiv_meta$maternal_ID[indiv_meta$maternal_ID==0]<-NA
}
if(!all(fam_info$sex==0)){
indiv_meta$sex <- dplyr::case_match(
fam_info$sex,
1 ~ "male",
2 ~ "female",
.default = NA,
.ptype = factor(levels = c("female", "male"))
)
}
if(!all(fam_info$affection %in% c(0,-9))){
indiv_meta$phenotype <- dplyr::case_match(
fam_info$affection,
1 ~ "control",
2 ~ "case",
-9 ~ NA,
.default = NA,
.ptype = factor(levels = c("control", "case"))
)
}
new_gen_tbl <- tibble::new_tibble(
indiv_meta,
class = "gen_tbl"
Expand Down
6 changes: 3 additions & 3 deletions R/gen_tibble_ped.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,10 @@ read.pedfile <- function(file, n, snps, which, split="\t| +", sep=".",
# r2 <- as.raw(2)
# r3 <- as.raw(3)

r0 <- 3
r1 <- 2
r0 <- NA_integer_
r1 <- 0
r2 <- 1
r3 <- 0
r3 <- 2

## Input file
con <- gzfile(file)
Expand Down
161 changes: 146 additions & 15 deletions R/gt_as_plink.R
Original file line number Diff line number Diff line change
@@ -1,38 +1,169 @@
#' Export a `gen_tibble` object to PLINK bed format
#'
#' This function exports all the information of a `gen_tibble` object into
#' a PLINK bed file.
#' a PLINK bed, ped or raw file (and associated files, i.e. .bim and .fam for .bed;
#' .fam for .ped).
#'
#' @param x a [`gen_tibble`] object
#' @param bedfile a character string giving the path to output bed file.
#' @param file a character string giving the path to output file. If left to NULL,
#' the output file will have the same path and prefix of the backingfile.
#' @param type one of "bed", "ped" or "raw"
#' @param overwrite boolean whether to overwrite the file.
#' @returns TRUE if successful
#' @returns the path of the saved file
#' @export


gt_as_plink <- function(x, bedfile = NULL,
gt_as_plink <- function(x, file = NULL, type = c("bed","ped","raw"),
overwrite = TRUE){
if (is.null(bedfile)){
bedfile <- bigstatsr::sub_bk(attr(x$genotypes,"bigsnp")$genotypes$backingfile,".bed")
type <- match.arg(type)

if (is.null(file)){
file <- bigstatsr::sub_bk(attr(x$genotypes,"bigsnp")$genotypes$backingfile,paste0(".",type))
}
if (file_ext(file)!=type){
file <- paste0(file,".",type)
}
if (file_ext(bedfile)!="bed"){
bedfile <- paste0(bedfile,".bed")

if (type=="bed"){
all_files <- c(file,
bigsnpr::sub_bed(file,".bim"),
bigsnpr::sub_bed(file,".fam")
)
} else if (type=="ped"){
all_files <- c(file,
gsub(".ped",".fam",file))
} else if (type=="raw"){
all_files <- file
}
if (file.exists(bedfile)){

if (any(file.exists(all_files))){
if (overwrite){
file.remove(bedfile)
file.remove(bigsnpr::sub_bed(bedfile,".bim"))
file.remove(bigsnpr::sub_bed(bedfile,".fam"))
} else {
stop(bedfile," already exists; remove if first or set 'overwrite' = TRUE")
file.remove(all_files[file.exists(all_files)])
} else {
stop("at least one of", all_files," already exists; remove if first or set 'overwrite' = TRUE")
}
}

if (type == "bed"){
gt_write_bed(x, file)
} else {
gt_write_ped_raw(x,file,type)
}
}


gt_write_bed <- function(x, file) {
bed_path <- bigsnpr::snp_writeBed(attr(x$genotypes,"bigsnp"),
bedfile = bedfile,
bedfile = file,
ind.row = vctrs::vec_data(x$genotypes),
ind.col = show_loci(x)$big_index)
# the bim and fam file only contain the original information in the bigSNP object
# TODO we should update them with the info from the gentibble
bed_path
}



gt_write_ped_raw <- function(x, file, plink_format = c("raw","ped"), chunk_size = 10000){

# loci information
loci <- show_loci(x)
# replace missing value with zero
loci$allele_alt[is.na(loci$allele_alt)]<-"0"

# create col names to use in the raw file
raw_col_names<- c("FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE",
paste0(loci$name,"_",toupper(loci$allele_alt),"(/",toupper(loci$allele_ref),")"))

# create the info for the fam file
indiv_meta <- tibble(population = x$population,
id = x$id,
pat = pull_NA(x, "pat"),
mat = pull_NA(x, "mat"),
sex = pull_NA(x, "sex"),
phenotype = pull_NA(x, "phenotype")
)
# recode some variables
indiv_meta$sex<- dplyr::case_match(as.character(indiv_meta$sex),'female'~'2','male'~'1',.default="0")
indiv_meta$pat[is.na(indiv_meta$pat)]<-0
indiv_meta$mat[is.na(indiv_meta$mat)]<-0
indiv_meta$phenotype<-dplyr::case_match(as.character(indiv_meta$phenotype),
'control'~"1",'case'~"2", .default=indiv_meta$phenotype )
indiv_meta$phenotype[is.na(indiv_meta$phenotype)]<--9

# create chunks
n_ind <- nrow(x)
chunks <- split(1:n_ind, ceiling(seq_along(1:n_ind)/chunk_size))

# loop over to write
for (i_chunk in seq_len(length(chunks))){
chunk <- chunks[[i_chunk]]
raw_table <- cbind(indiv_meta[chunk,],
show_genotypes(x[chunk,]))

# now recode the genotypes with letters if raw
if (plink_format=="ped"){
for (i in 1:(ncol(raw_table)-6)){
raw_table[,i+6]<-recode_genotype(raw_table[,i+6], loci$allele_ref[i], loci$allele_alt[i])
}
}

colnames(raw_table) <- raw_col_names
# append column names only the first time, when the file does not exist
utils::write.table(raw_table,
file=file,
sep = " ",
row.names = FALSE,
col.names=(!file.exists(file) & plink_format=="raw"),
append = file.exists(file),
quote = FALSE)

}

## create info for the map file
loci_meta <- show_loci(x)
map_file <- paste0(substr(file, 1, nchar(file)-4),".map")
map_table <- tibble(chrom = pull_NA(loci_meta,"chromosome"),
id = pull_NA(loci_meta,"name"),
cM = pull_NA(loci_meta,"cM"),
position = pull_NA(loci_meta,"position"))
map_table[is.na(map_table)]<-0

utils::write.table(
map_table,
file = map_file,
row.names = FALSE,
col.names = FALSE,
quote = FALSE
)
return(file)
}


# internal function returning either a vector from a given column or NA if it does not exist
pull_NA <- function(.x, .col_name) {
if (.col_name %in% names(.x)){
return(.x %>% pull(.col_name))
} else {
return(rep(NA,nrow(.x)))
}
}

# recode dosage as letter genotypes
recode_genotype <- function(x, allele_ref, allele_alt){
x <-as.character(x)
genotypes <- c(paste(allele_ref, allele_ref),
paste(allele_ref, allele_alt),
paste(allele_alt, allele_alt))
x <- dplyr::case_match(
x,
"0" ~ genotypes[1],
"1" ~ genotypes[2],
"2" ~ genotypes[3]
)
x[is.na(x)]<-c("0 0")
x

}


Binary file added inst/extdata/pop_a.bk
Binary file not shown.
5 changes: 5 additions & 0 deletions inst/extdata/pop_a.ped
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pop_a GRC14300079 0 0 1 -9 A A A A G G A A G G T G G G C C T T C C T T G G T T A G T T T T
pop_a GRC14300142 0 0 1 -9 A A A A G G A A G G G G G G C C T T G C A T G G T T G G C T T T
pop_a GRC14300159 0 0 1 -9 A A G A A G A A A A T G G G C C T T C C T T G G T T A G T T T T
pop_a GRC14300286 0 0 1 -9 A A A A G G A A G G G G G G C C T T G C T T G G T T A G C T T T
pop_a GRC14300305 0 0 1 -9 A A A A A G A A A G T T G G C C T T C C T T G G T T A G T T T T
Binary file added inst/extdata/pop_a.rds
Binary file not shown.
12 changes: 8 additions & 4 deletions man/gt_as_plink.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 4 additions & 4 deletions src/snp_as.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) {
size_t n = macc.nrow();
size_t m = macc.ncol();

for (int j = 0; j < m; j++) {
for (int i = 0; i < n; i++) {
for (unsigned int j = 0; j < m; j++) {
for (unsigned int i = 0; i < n; i++) {
int value = (macc(i,j));
if (value <3){
dos_mat(i, j) = value-1; // note that we want the matrix of (dos-1)
Expand All @@ -53,10 +53,10 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) {
}}

// fill the rest with 0s (should be 1 column max)
int m2 = dos_mat.n_cols;
size_t m2 = dos_mat.n_cols;
if (m2 > m) {
myassert(m2 == (m + 1), ERROR_BUG);
for (int i = 0; i < n; i++) {
for (unsigned int i = 0; i < n; i++) {
dos_mat(i, m) = 1;
na_mat(i, m) = 0;
}
Expand Down
8 changes: 4 additions & 4 deletions src/snp_ibs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) {
size_t n = macc.nrow();
size_t m = macc.ncol();

for (int j = 0; j < m; j++) {
for (int i = 0; i < n; i++) {
for (unsigned int j = 0; j < m; j++) {
for (unsigned int i = 0; i < n; i++) {
int value = (macc(i,j));
if (value == 0){
genotype0(i, j) = 1;
Expand All @@ -55,10 +55,10 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) {
}}

// fill the rest with 0s (should be 1 column max)
int m2 = genotype0.n_cols;
size_t m2 = genotype0.n_cols;
if (m2 > m) {
myassert(m2 == (m + 1), ERROR_BUG);
for (int i = 0; i < n; i++) {
for (unsigned int i = 0; i < n; i++) {
genotype0(i, m) = 0;
genotype1(i, m) = 0;
genotype2(i, m) = 0;
Expand Down
8 changes: 4 additions & 4 deletions src/snp_king.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) {
size_t n = macc.nrow();
size_t m = macc.ncol();

for (int j = 0; j < m; j++) {
for (int i = 0; i < n; i++) {
for (unsigned int j = 0; j < m; j++) {
for (unsigned int i = 0; i < n; i++) {
int value = (macc(i,j));
if (value == 0){
genotype0(i, j) = 1;
Expand All @@ -58,10 +58,10 @@ inline arma::mat FBM_RW2arma(Rcpp::Environment BM) {
}}

// fill the rest with 0s (should be 1 column max)
int m2 = genotype0.n_cols;
size_t m2 = genotype0.n_cols;
if (m2 > m) {
myassert(m2 == (m + 1), ERROR_BUG);
for (int i = 0; i < n; i++) {
for (unsigned int i = 0; i < n; i++) {
genotype0(i, m) = 0;
genotype1(i, m) = 0;
genotype2(i, m) = 0;
Expand Down
26 changes: 14 additions & 12 deletions tests/testthat/test_gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ test_loci <- data.frame(name=paste0("rs",1:6),
test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE)

# this also tests show_genotypes and show_loci
test_that("create gen_tibble from bed",{
test_that("create gen_tibble from dfs",{
expect_true(inherits(test_gt,"gen_tbl"))
# we can extract the genotypes correctly
extracted_genotypes <- test_gt %>% show_genotypes()
Expand All @@ -32,17 +32,6 @@ test_that("create gen_tibble from bed",{

})

# now create it directly from the dfs
test_that("create gen_tibble from dfs",{
test_dfs_gt <- gen_tibble(test_genotypes, indiv_meta = test_indiv_meta,
loci = test_loci, quiet = TRUE)
# because of the different backing file info, we cannot use identical on the whole object
expect_true(identical(show_genotypes(test_gt), show_genotypes(test_dfs_gt)))
expect_true(identical(show_loci(test_gt), show_loci(test_dfs_gt)))
expect_true(identical(test_gt %>% select(-genotypes),
test_dfs_gt %>% select(-genotypes)))
})

test_genotypes_c <- rbind(c("1","1","0","1","1","0"),
c("2","1","0","0","0","0"),
c("2","2","0","0","1","1"))
Expand Down Expand Up @@ -81,4 +70,17 @@ test_that("gen_tibble catches invalid alleles",{

})

test_that("gen_tibble from a bed file",{
bed_path <- system.file("extdata/pop_a.bed", package = "tidypopgen")
pop_a_gt <- gen_tibble(bed_path, quiet=TRUE, backingfile = tempfile())
# now read the dosages created by plink when saving in raw format
raw_file_pop_a <- read.table(system.file("extdata/pop_a.raw", package = "tidypopgen"), header= TRUE)
mat <- as.matrix(raw_file_pop_a[,7:ncol(raw_file_pop_a)])
mat <- unname(mat)
expect_true(all.equal(mat,show_genotypes(pop_a_gt)))
# now read in the ped file
ped_path <- system.file("extdata/pop_a.ped", package = "tidypopgen")
pop_a_ped_gt <- gen_tibble(ped_path, quiet=TRUE,backingfile = tempfile())
all.equal(show_genotypes(pop_a_gt),show_genotypes(pop_a_ped_gt))

})
Loading

0 comments on commit 9cadb13

Please sign in to comment.