Skip to content

Commit

Permalink
Merge pull request #30 from EvolEcolGroup/king_test
Browse files Browse the repository at this point in the history
King test
  • Loading branch information
dramanica authored May 3, 2024
2 parents ffad4b2 + c8c38a3 commit 1109f1d
Show file tree
Hide file tree
Showing 181 changed files with 6,196 additions and 16,880 deletions.
4 changes: 3 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ Imports:
bigsnpr,
bigstatsr,
generics,
genio,
ggplot2,
magrittr,
methods,
MASS,
patchwork,
rlang,
Expand All @@ -37,6 +37,8 @@ Imports:
vctrs
Suggests:
adegenet,
broom,
hierfstat,
knitr,
detectRUNS,
LEA,
Expand Down
41 changes: 25 additions & 16 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ S3method("show_loci<-",vctrs_bigSNP)
S3method(augment,gt_dapc)
S3method(augment,gt_pca)
S3method(augment_loci,gt_pca)
S3method(autoplot,gt_cluster_pca)
S3method(autoplot,gt_dapc)
S3method(autoplot,gt_pca)
S3method(autoplot,gt_pca_clust)
S3method(autoplot,indiv_qc_report)
S3method(autoplot,loci_qc_report)
S3method(autoplot,qc_report_indiv)
S3method(autoplot,qc_report_loci)
S3method(count_loci,tbl_df)
S3method(count_loci,vctrs_bigSNP)
S3method(dplyr_reconstruct,gen_tbl)
Expand All @@ -26,6 +26,8 @@ S3method(indiv_missingness,vctrs_bigSNP)
S3method(loci_alt_freq,grouped_df)
S3method(loci_alt_freq,tbl_df)
S3method(loci_alt_freq,vctrs_bigSNP)
S3method(loci_chromosomes,tbl_df)
S3method(loci_chromosomes,vctrs_bigSNP)
S3method(loci_hwe,grouped_df)
S3method(loci_hwe,tbl_df)
S3method(loci_hwe,vctrs_bigSNP)
Expand All @@ -38,6 +40,8 @@ S3method(loci_maf,vctrs_bigSNP)
S3method(loci_missingness,grouped_df)
S3method(loci_missingness,tbl_df)
S3method(loci_missingness,vctrs_bigSNP)
S3method(loci_names,tbl_df)
S3method(loci_names,vctrs_bigSNP)
S3method(loci_sums,grouped_df)
S3method(loci_sums,tbl_df)
S3method(loci_sums,vctrs_bigSNP)
Expand All @@ -48,8 +52,6 @@ S3method(show_genotypes,tbl_df)
S3method(show_genotypes,vctrs_bigSNP)
S3method(show_loci,tbl_df)
S3method(show_loci,vctrs_bigSNP)
S3method(show_loci_names,tbl_df)
S3method(show_loci_names,vctrs_bigSNP)
S3method(summary,rbind_report)
S3method(summary,vctrs_bigSNP)
S3method(tbl_sum,gen_tbl)
Expand All @@ -61,44 +63,51 @@ export(augment)
export(augment_loci)
export(autoplot)
export(count_loci)
export(filter_high_relatedness)
export(gen_tibble)
export(gt_as_genind)
export(gt_as_genlight)
export(gt_as_geno_lea)
export(gt_as_hierfstat)
export(gt_as_plink)
export(gt_cluster_pca)
export(gt_cluster_pca_best_k)
export(gt_dapc)
export(gt_get_file_names)
export(gt_has_imputed)
export(gt_ibs)
export(gt_impute_simple)
export(gt_king)
export(gt_load)
export(gt_pca_autoSVD)
export(gt_pca_clust_best_k)
export(gt_pca_find_clusters)
export(gt_pca_partialSVD)
export(gt_pca_randomSVD)
export(gt_roh_window)
export(gt_save)
export(gt_set_imputed)
export(gt_uses_imputed)
export(gt_vcf_to_bed)
export(gt_write_bed_from_dfs)
export(gt_write_lea_geno)
export(gt_write_plink)
export(indiv_het_obs)
export(indiv_missingness)
export(indiv_qc_report)
export(loci_alt_freq)
export(loci_chromosomes)
export(loci_hwe)
export(loci_ld_clump)
export(loci_maf)
export(loci_missingness)
export(loci_qc_report)
export(loci_names)
export(loci_sums)
export(pop_pairwise_fst)
export(pairwise_allele_sharing)
export(pairwise_ibs)
export(pairwise_pop_fst)
export(pop_fis)
export(pop_fst)
export(qc_report_indiv)
export(qc_report_loci)
export(rbind_dry_run)
export(select_loci)
export(select_loci_if)
export(show_genotypes)
export(show_loci)
export(show_loci_names)
export(snp_allele_sharing)
export(snp_ibs)
export(snp_king)
export(tidy)
Expand Down
8 changes: 6 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,15 @@ SNPHWE_midp_t <- function(obs_hets, obs_hom1, obs_hom2, thresh) {
.Call(`_tidypopgen_SNPHWE_midp_t`, obs_hets, obs_hom1, obs_hom2, thresh)
}

increment_as_counts <- function(k, k2, na_mat, dos_mat, BM, rowInd, colInd) {
invisible(.Call(`_tidypopgen_increment_as_counts`, k, k2, na_mat, dos_mat, BM, rowInd, colInd))
}

increment_ibs_counts <- function(k, k2, genotype0, genotype1, genotype2, BM, rowInd, colInd) {
invisible(.Call(`_tidypopgen_increment_ibs_counts`, k, k2, genotype0, genotype1, genotype2, BM, rowInd, colInd))
}

increment_king_numerator <- function(k, genotype0, genotype1, genotype2, BM, rowInd, colInd) {
invisible(.Call(`_tidypopgen_increment_king_numerator`, k, genotype0, genotype1, genotype2, BM, rowInd, colInd))
increment_king_numerator <- function(k, n_Aa_i, genotype0, genotype1, genotype2, genotype_valid, BM, rowInd, colInd) {
invisible(.Call(`_tidypopgen_increment_king_numerator`, k, n_Aa_i, genotype0, genotype1, genotype2, genotype_valid, BM, rowInd, colInd))
}

6 changes: 3 additions & 3 deletions R/augment_loci.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
#'
#' `augment_loci` add columns to the loci table of a `gen_tibble` related
#' to information from a given analysis.
#' @param x A `gt_pca` object returned by one of the `gt_pca_*` functions.
#' @param x An object returned by one of the `gt_` functions (e.g. [gt_pca()]).
#' @param data the `gen_tibble` used to run the PCA.
#' @param ... Additional paramters passed to the individual methods.
#' @return A [gen_tibble] with additiona columns added to the loci tibble (accessible
#' @param ... Additional parameters passed to the individual methods.
#' @return A [gen_tibble] with additional columns added to the loci tibble (accessible
#' with [show_loci()]. If `data` is missing, a tibble of the information, with a
#' column `.rownames` giving the loci names.
#' @export
Expand Down
2 changes: 1 addition & 1 deletion R/autoplot_gt_dapc.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#' columns should be used.
#' @param ... not currently used.
#' @returns a `ggplot2` object
#' @rdname autoplot_gt_pca
#' @rdname autoplot_gt_dapc
#' @export
autoplot.gt_dapc <- function(object,
type=c("screeplot", "scores", "loadings", "components"),
Expand Down
2 changes: 1 addition & 1 deletion R/autoplot_gt_pca.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#' e.g. c(1,2); for `loadings` either one or more values.
#' @param ... not currently used.
#' @returns a `ggplot2` object
#' @rdname autoplot_gt_pca
#' @name autoplot_gt_pca
#' @export
autoplot.gt_pca <- function(object,
type=c("screeplot", "scores","loadings"),
Expand Down
117 changes: 117 additions & 0 deletions R/filter_high_relatedness.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#' Filter individuals based on a relationship threshold
#'
#' This function takes a matrix of x by y individuals containing relatedness
#' coefficients and returns the maximum set of individuals that contains no
#' relationships above the given threshold.
#'
#' TODO this function needs a test
#'
#' @param matrix a square symmetric matrix of individuals containing relationship coefficients
#' @param .x a [`gen_tibble`] object
#' @param kings_threshold a threshold over which
#' @param verbose boolean whether to report to screen
#' @return a list where '1' is individual ID's to retain, '2'
#' is individual ID's to remove, and '3' is a boolean where individuals to keep
#' are TRUE and individuals to remove are FALSE
#' @rdname filter_high_relatedness
#' @export
#'
filter_high_relatedness <-
function(matrix, .x = NULL, kings_threshold = NULL, verbose = FALSE) {

var_num <- dim(matrix)[1]

if(is.null(dimnames(matrix))){
if(is.null(.x)){
colnames(matrix) <- 1:ncol(matrix)
rownames(matrix) <- 1:nrow(matrix)
} else {
colnames(matrix) <- (.x)$id
rownames(matrix) <- (.x)$id
}
}

var_names <- dimnames(matrix)[[1]]

matrix <- abs(matrix)

# re-ordered columns based on max absolute correlation
original_order <- 1:var_num

average_corr <-
function(matrix) {
mean(matrix, na.rm = TRUE)
}
tmp <- matrix
diag(tmp) <- NA

max_abs_cor_order <-
order(apply(tmp, 2, average_corr), decreasing = TRUE)
matrix <- matrix[max_abs_cor_order, max_abs_cor_order]
newOrder <- original_order[max_abs_cor_order]
rm(tmp)

col_to_delete <- rep(FALSE, var_num)

matrix2 <- matrix
diag(matrix2) <- NA

for (i in 1:(var_num - 1)) {
if (!any(matrix2[!is.na(matrix2)] > kings_threshold)) {
if (verbose) {
message("All correlations <=", kings_threshold, "\n")
}
break()
}
if (col_to_delete[i]) {
next
}
for (j in (i + 1):var_num) {
if (!col_to_delete[i] & !col_to_delete[j]) {
if (matrix[i, j] > kings_threshold) {
mn1 <- mean(matrix2[i, ], na.rm = TRUE)
mn2 <- mean(matrix2[-j, ], na.rm = TRUE)
if (verbose) {
message(
"Compare row",
newOrder[i],
" and column ",
newOrder[j],
"with corr ",
round(matrix[i, j], 3),
"\n"
)
}
if (verbose) {
message(" Means: ", round(mn1, 3), "vs", round(mn2, 3))
}
if (mn1 > mn2) {
col_to_delete[i] <- TRUE
matrix2[i, ] <- NA
matrix2[, i] <- NA
if (verbose) {
message(" so flagging column", newOrder[i], "\n")
}
} else {
col_to_delete[j] <- TRUE
matrix2[j, ] <- NA
matrix2[, j] <- NA
if (verbose) {
message(" so flagging column", newOrder[j], "\n")
}
}
}
}
}
}


# return variable names
passed_filter <- var_names[newOrder][!col_to_delete]
#attr(passed_filter, "to_remove")
to_remove <- var_names[!var_names %in% passed_filter]

var_names <- var_names %in% passed_filter == TRUE

return(list(passed_filter,to_remove,var_names))
}
39 changes: 23 additions & 16 deletions R/gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
#' A `gen_tibble` stores genotypes for individuals in a tidy format. DESCRIBE
#' here the format
#' @param x can be:
#' - a string giving the path to a plink BED file, or a [`bigsnpr::bigSNP`]
#' - a string giving the path to a PLINK BED or PED file. The correspective
#' BIM and FAM fiel for the BED, or MAP for PED are expected to be in the same
#' directory and have the same file name.
#' - a string giving the path to a RDS file storing a `bigSNP` object from
#' the `bigsnpr` package (usually created with [bigsnpr::snp_readBed()])
#' - a string giving the path to a vcf file. Note that we currently read the whole
Expand All @@ -21,7 +23,7 @@
#' 'C' and 'G'.
#' @param missing_alleles a vector of values in the BIM file/loci dataframe that
#' indicate a missing value for the allele value (e.g. when we have a monomorphic
#' locus with only one allele). It defalts to '0' and '.' (the same as PLINK 1.9).
#' locus with only one allele). It defaults to '0' and '.' (the same as PLINK 1.9).
#' @param backingfile the path, including the file name without extension,
#' for backing files used to store the data (they will be given a .bk
#' and .RDS automatically). This is not needed if `x` is already an .RDS file.
Expand Down Expand Up @@ -72,12 +74,18 @@ gen_tibble.character <-
missing_alleles= missing_alleles,
backingfile = backingfile,
quiet = quiet)
} else if ((tolower(file_ext(x))=="vcf") || (tolower(file_ext(x))=="vcf.gz")){
} else if ((tolower(file_ext(x))=="vcf") || (tolower(file_ext(x))=="gz")){
gen_tibble_vcf(x = x, ...,
valid_alleles= valid_alleles,
missing_alleles= missing_alleles,
backingfile = backingfile, quiet = quiet)
} else {
} else if (tolower(file_ext(x))=="ped"){
gen_tibble_ped(x = x, ...,
valid_alleles= valid_alleles,
missing_alleles= missing_alleles,
backingfile = backingfile,
quiet = quiet)
} else {
stop("file_path should be pointing to a either a PLINK .bed file, a bigSNP .rds file or a VCF .vcf or .vcf.gz file")
}
}
Expand Down Expand Up @@ -155,7 +163,7 @@ gen_tibble_vcf <- function(x, ...,
ind_meta <- tibble(id = colnames(x), population = NA)

# using the gen_tibble.matrix method
new_gen_tbl <- gen_tibble(x = x,
new_gen_tbl <- gen_tibble(x = t(x),
indiv_meta = ind_meta,
loci = loci,
backingfile = backingfile)
Expand Down Expand Up @@ -195,6 +203,14 @@ gen_tibble.matrix <- function(x, indiv_meta, loci, ...,
stop("'x' is not a matrix of integers")
}

# check dimensions
if (ncol(x)!=nrow(loci)){
stop ("there is a mismatch between the number of loci in the genotype table x and in the loci table")
}
if (nrow(x)!=nrow(indiv_meta)){
stop ("there is a mismatch between the number of loci in the genotype table x and in the loci table")
}

# use code for NA in FBM.256
x[is.na(x)]<-3

Expand Down Expand Up @@ -338,7 +354,7 @@ stopifnot_gen_tibble <- function(.x){
#' @export
tbl_sum.gen_tbl <- function(x, ...) {
c(
"A gen_tibble" = paste(nrow(show_loci(x))," loci"),
"A gen_tibble" = paste(count_loci(x)," loci"),
NextMethod()
)
}
Expand All @@ -361,7 +377,7 @@ check_allele_alphabet <- function(x,
# loci_info is usually from show_loci()
harmonise_missing_values <- function (loci_info, missing_alleles =c("0",".")){
# 0 is always considered as a missing value
if ("0" %in% missing_alleles){
if (!"0" %in% missing_alleles){
missing_alleles <- c(missing_alleles, "0")
}
loci_info$allele_ref[loci_info$allele_ref %in% missing_alleles]<-NA
Expand All @@ -372,13 +388,4 @@ harmonise_missing_values <- function (loci_info, missing_alleles =c("0",".")){



##########################################
# convenient functs
.gt_bigsnp_cols <- function(.x){
show_loci(.x)$big_index
}

.gt_bigsnp_rows <- function(.x){
vctrs::vec_data(.x$genotypes)
}

Loading

0 comments on commit 1109f1d

Please sign in to comment.