Skip to content

Commit

Permalink
Merge branch 'main' into documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
dramanica committed Dec 2, 2024
2 parents 095794c + 333b56a commit 6bbdd7b
Show file tree
Hide file tree
Showing 24 changed files with 269 additions and 767 deletions.
84 changes: 34 additions & 50 deletions R/gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
#' A `gen_tibble` stores genotypes for individuals in a tidy format. DESCRIBE
#' here the format
#'
#' - *VCF* files: the fast `cpp` parser is used by default. It attempts to establish
#' ploidy from the first variant; if that variant is found in a sex chromosome (or mtDNA),
#' it will fail. To use the fast parser, change the order of variants so that the first chromosome is an
#' autosome using a tool such as `vcftools`. Alternatively, the slower but more robust
#' `vcfR` parser should be used instead. Currently, only biallelic SNPs are supported. If haploid
#' variants (e.g. sex chromosomes) are included in the vcf, they are not transformed into homozygous calls.
#' Instad, reference alleles will be counted as 0 and alternative alleles will be counted as 1.
#'- *VCF* files: the fast `cpp` parser is used by default. Both `cpp` and `vcfR` parsers
#' attempt to establish ploidy from the first variant; if that variant is found in a
#' sex chromosome (or mtDNA), the parser will fail with 'Error: a genotype has more
#' than max_ploidy alleles...'. To successful import such a *VCF*, change the order of variants
#' so that the first chromosome is an autosome using a tool such as `vcftools`.
#' Currently, only biallelic SNPs are supported. If haploid variants (e.g. sex
#' chromosomes) are included in the *VCF*, they are not transformed into homozygous
#' calls. Instead, reference alleles will be counted as 0 and alternative alleles
#' will be counted as 1.
#'
#' - *packedancestry* files: When loading *packedancestry* files, missing alleles will be converted from
#' 'X' to NA
Expand All @@ -21,9 +23,9 @@
#' - a string giving the path to a RDS file storing a `bigSNP` object from
#' the `bigsnpr` package (usually created with [bigsnpr::snp_readBed()])
#' - a string giving the path to a vcf file. Note that we currently read the whole
#' vcf in memory with `vcfR`, so only smallish vcf can be imported. Only biallelic
#' vcf in memory with `vcfR`, so only smallish *VCF* can be imported. Only biallelic
#' SNPs will be considered.
#' - a string giving the path to a packedancestry .geno file. The associated
#' - a string giving the path to a *packedancestry* .geno file. The associated
#' .ind and .snp files are expected to be in the same directory and share the
#' same file name prefix.
#' - a genotype matrix of dosages (0, 1, 2, NA) giving the dosage of the alternate
Expand All @@ -41,9 +43,9 @@
#' if `x` is a vcf or packedancestry file)
#' @param ... if `x` is the name of a vcf file, additional arguments
#' passed to [vcfR::read.vcfR()]. Otherwise, unused.
#' @param parser the name of the parser used for VCF, either "cpp" to use
#' @param parser the name of the parser used for *VCF*, either "cpp" to use
#' a fast C++ parser, or "vcfR" to use the R package `vcfR`. The latter is slower
#' but more robust; if "cpp" gives error, try using "vcfR" in case your VCF has
#' but more robust; if "cpp" gives error, try using "vcfR" in case your *VCF* has
#' an unusual structure.
#' @param n_cores the number of cores to use for parallel processing
#' @param valid_alleles a vector of valid allele values; it defaults to 'A','T',
Expand All @@ -54,10 +56,10 @@
#' @param backingfile the path, including the file name without extension,
#' for backing files used to store the data (they will be given a .bk
#' and .RDS automatically). This is not needed if `x` is already an .RDS file.
#' If `x` is a .BED file and `backingfile` is left NULL, the backing file will
#' If `x` is a .BED or a *VCF* file and `backingfile` is left NULL, the backing file will
#' be saved in the same directory as the
#' bed file, using the same file name but with a different file type (.bk rather
#' than .bed). The same logic applies to .vcf files. If `x` is a genotype matrix and `backingfile` is NULL, then a
#' than .bed). If `x` is a genotype matrix and `backingfile` is NULL, then a
#' temporary file will be created (but note that R will delete it at the end of
#' the session!)
#' @param quiet provide information on the files used to store the data
Expand Down Expand Up @@ -137,9 +139,8 @@ gen_tibble.character <-
stop("file_path should be pointing to a either a PLINK .bed or .ped file, a bigSNP .rds file or a VCF .vcf or .vcf.gz file")
}

loci <- show_loci(x_gt)
new_loci <- add_chromosome_as_int(loci)
show_loci(x_gt) <- new_loci
# create a chr_int column
show_loci(x_gt)$chr_int <- cast_chromosome_to_int(show_loci(x_gt)$chromosome)

file_in_use <- gt_save_light(x_gt, quiet = quiet)
return(x_gt)
Expand Down Expand Up @@ -313,9 +314,8 @@ gen_tibble.matrix <- function(x, indiv_meta, loci, ...,
remove_on_fail = TRUE)
show_loci(new_gen_tbl) <- harmonise_missing_values(show_loci(new_gen_tbl), missing_alleles = missing_alleles)

loci <- show_loci(new_gen_tbl)
new_loci <- add_chromosome_as_int(loci)
show_loci(new_gen_tbl) <- new_loci
# create a chr_int column
show_loci(new_gen_tbl)$chr_int <- cast_chromosome_to_int(show_loci(new_gen_tbl)$chromosome)

files_in_use <- gt_save(new_gen_tbl, quiet = quiet)
return(new_gen_tbl)
Expand Down Expand Up @@ -572,39 +572,23 @@ change_duplicated_file_name <- function(file){
}

# adding a chr_int column
add_chromosome_as_int <- function(loci){


if(is.integer(loci$chromosome)){
loci$chr_int <- loci$chromosome

} else if(is.numeric(loci$chromosome)){
non_na_id <- !is.na(loci$chromosome)
chr_int <- as.integer(loci$chromosome[non_na_id])

loci$chr_int <- rep(NA_integer_, length(loci$chromosome))
loci$chr_int[non_na_id] <- chr_int

} else if(is.character(loci$chromosome)){
non_na_id <- !is.na(loci$chromosome)
chr_factor <- as.factor(loci$chromosome[non_na_id])
chr_int <- as.integer(chr_factor)

loci$chr_int <- rep(NA_integer_, length(loci$chromosome))
loci$chr_int[non_na_id] <- chr_int


} else if(is.factor(loci$chromosome)){
non_na_id <- !is.na(loci$chromosome)
chr_int <- as.integer(loci$chromosome[non_na_id])

loci$chr_int <- rep(NA_integer_, length(loci$chromosome))
loci$chr_int[non_na_id] <- chr_int

cast_chromosome_to_int <- function(chromosome){
# if chromosome is an factor, then cast it to character
if (is.factor(chromosome)){
chromosome <- as.character(chromosome)
}
# if chromosome is a character, then cast it to integer
if (is.character(chromosome)){
chromosome <- tryCatch(as.integer(chromosome), warning = function(e) {as.integer(as.factor(chromosome))})
}
if (is.numeric(chromosome)){
chromosome <- as.integer(chromosome)
}
if (is.integer(chromosome)){
return(chromosome)
} else {

stop("Chromosome column should be integer, character, or factor")
}
return(loci)

}

2 changes: 1 addition & 1 deletion R/gt_save.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ gt_save <- function(x, file_name = NULL, quiet = FALSE) {
return(c(file_name,gt_get_file_names(x)))
}

#' a light version of gt_save that does not resave the RDS, to be used internally
#' a light version of gt_save that does not resave the bigSNP RDS or backing file, to be used internally
#' when creating a gen_tibble (if we have just created it, there is not need
#' to resave it)
#' @param x a [`gen_tibble`]
Expand Down
13 changes: 11 additions & 2 deletions R/loci_ld_clump.R
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,16 @@ loci_ld_clump.vctrs_bigSNP <- function(.x,
# rows (individuals) that we want to use
rows_to_keep <- vctrs::vec_data(.x)
if (use_positions){
.positions <- show_loci(.x)$position
#.positions <- show_loci(.x)$position
#.positions <- attr(.x,"bigsnp")$map$physical.pos
.positions <- rep(NA, nrow(attr(.x,"bigsnp")$map))
.positions[show_loci(.x)$big_index] <- show_loci(.x)$position
} else {
.positions <- NULL
}
# create a chromosome vector (fill gaps between bigsnpr and show_loci)
.chromosome <- rep(2147483647L, nrow(attr(.x,"bigsnp")$map))
.chromosome[show_loci(.x)$big_index] <- show_loci(.x)$chr_int
# now figure out if we have any snp which have already been removed
# those will go into `exclude`
loci_not_in_tibble <- seq_len(nrow(attr(.x,"bigsnp")$map))[!seq_len(nrow(attr(.x,"bigsnp")$map)) %in%
Expand All @@ -95,7 +101,10 @@ loci_ld_clump.vctrs_bigSNP <- function(.x,

# as long as we have more than one individual
snp_clump_ids <- bigsnpr::snp_clumping(G = attr(.x,"bigsnp")$genotypes,
infos.chr = show_loci(.x)$chr_int,
#infos.chr = show_loci(.x)$chr_int,
# TEMP HACK using the info from the bigsnpr object
#infos.chr = cast_chromosome_to_int(attr(.x,"bigsnp")$map$chromosome),
infos.chr = .chromosome,
ind.row = vctrs::vec_data(.x),
S = S,
thr.r2 = thr_r2,
Expand Down
3 changes: 0 additions & 3 deletions R/snp_king.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@
#'
#' This function computes the KING-robust estimator of kinship.
#'
#' The last step is not optimised yet, as it does the division of the num by the
#' den all in memory (on my TODO list...).
#'
#' @param X a [bigstatsr::FBM.code256] matrix (as found in the `genotypes`
#' slot of a [bigsnpr::bigSNP] object).
#' @param ind.row An optional vector of the row indices that are used.
Expand Down
5 changes: 0 additions & 5 deletions R/vcf_to_fbm_cpp.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,6 @@ vcf_to_fbm_cpp <- function(
# add the new columns
file_backed_matrix$add_columns(chunk_info$num_loci)
# fill them in
# file_backed_matrix[
# ,
# index_start:(index_start+chunk_info$num_loci-1)
# ] <- genotypes_matrix[,1:chunk_info$num_loci]

write_to_FBM(file_backed_matrix,
allele_counts = genotypes_matrix,
col_start = index_start-1,
Expand Down
7 changes: 6 additions & 1 deletion R/vcf_to_fbm_vcfR.R
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,12 @@ poly_indiv_dosage <- function (x, max_ploidy){
# get dosages for a genotype x and return them as raw for inclusion in the filebacked matrix
poly_genotype_dosage <- function (x, max_ploidy){
if (!is.na(x[1]) && x[1]!="."){
as.raw(sum(as.numeric(x)))
dosage <- sum(as.numeric(x))
if (dosage<(max_ploidy+1)){
return(as.raw(dosage))
} else{
stop("a genotype has more than max_ploidy alleles. We estimate max_ploidy from the first variant in the vcf file, make sure that variant is representative of ploidy (e.g. it is not on a sex chromosome).")
}
} else {
return(as.raw(max_ploidy+1))
}
Expand Down
40 changes: 0 additions & 40 deletions data-raw/ideas/allele_sharing.R

This file was deleted.

111 changes: 0 additions & 111 deletions data-raw/ideas/fst_gt_compare_to_hier.R

This file was deleted.

Loading

0 comments on commit 6bbdd7b

Please sign in to comment.