Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hwe plot #51

Merged
merged 7 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ S3method("show_loci<-",tbl_df)
S3method("show_loci<-",vctrs_bigSNP)
S3method(augment,gt_dapc)
S3method(augment,gt_pca)
S3method(augment,q_matrix)
S3method(augment_loci,gt_pca)
S3method(autoplot,gt_cluster_pca)
S3method(autoplot,gt_dapc)
Expand Down
95 changes: 69 additions & 26 deletions R/as_q_matrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@





#' Tidy a Q matrix
#'
#' Takes a `q_matrix` object, which is a matrix, and returns a tidied tibble.
Expand All @@ -33,44 +31,63 @@
#' @export
tidy.q_matrix <- function(x, data, ...){
rlang::check_dots_empty()

q_tbl <- x %>%
tibble::as_tibble() %>%
tibble::as_tibble()

q_tbl <- q_tbl %>%
dplyr::mutate(id = data$id,
# @TODO we should get this from the grouped tibble, not hardcode it!
#@TODO
group = data$population)

q_tbl <- q_tbl %>% tidyr::pivot_longer(cols = dplyr::starts_with(".Q"),
names_to = "q", values_to = "percentage")
q_tbl$percentage <- as.numeric(q_tbl$percentage)
#q_tbl
dominant_q <- q_tbl %>%
dplyr::group_by(.data$id) %>%
dplyr::summarize(dominant_pop = .data$group[which.max(.data$percentage)], dominant_q = max(.data$percentage), .groups = 'drop')

q_tbl <- q_tbl %>%
dplyr::left_join(dominant_q, by = "id")

q_tbl <- q_tbl %>%
dplyr::arrange(.data$group, dplyr::desc(.data$dominant_q)) %>%
dplyr::mutate(plot_order = dplyr::row_number(), # Create plot_order column
id = factor(.data$id, levels = unique(.data$id[order(.data$group, -.data$dominant_q)])))
q_tbl
}


#' Augment data with information from a q_matrix object
#'
#' Augment for `q_matrix` accepts a model object and a dataset and adds
#' Q values to each observation in the dataset.
#' Q values are stored in separate columns, which is given name with the
#' pattern ".Q1",".Q2", etc. For consistency with [broom::augment.prcomp], a column
#' ".rownames" is also returned; it is a copy of 'id', but it ensures that
#' any scripts written for data augmented with [broom::augment.prcomp] will
#' work out of the box (this is especially helpful when adapting plotting scripts).
#' @param x A `q_matrix` object
#' @param data the `gen_tibble` used to run the clustering algorithm
#' @param ... Not used. Needed to match generic signature only.
#' @return A [gen_tibble] containing the original data along with
#' additional columns containing each observation's Q values.
#' @export
#' @name augment_q_matrix

augment.q_matrix <- function(x, data = NULL, ...) {

if (!".rownames" %in% names(data)) {
data <- data %>%
dplyr::mutate(.rownames = data$id)

Check warning on line 68 in R/as_q_matrix.R

View check run for this annotation

Codecov / codecov/patch

R/as_q_matrix.R#L67-L68

Added lines #L67 - L68 were not covered by tests
}


q_tbl <- tidy(x,data)

data <- dplyr::left_join(data,q_tbl, by = "id")

}


#' Tidy ADMXITURE output files into plots
#'
#' Takes the name of a directory containing .Q file outputs, and
#' produces a list of tidied tibbles ready to plot.
#'
#' @param x the name of a directory containing .Q files
#' @param data An associated tibble (e.g. a [`gen_tibble`]), with the individuals in the same order as the data used to
#' generate the Q matrix
#' @returns a list of `q_matrix` objects to plot
#'
#' @export

read_q_matrix_list <- function(x, data){
read_q_matrix_list <- function(x){


files <- list.files(x, pattern = "\\.Q$", full.names = TRUE)

Expand All @@ -83,9 +100,6 @@
# Sort matrix_list by the number of columns
matrix_list <- matrix_list[order(sapply(matrix_list, ncol))]

# Tidy each
matrix_list <- lapply(matrix_list, function(x) tidy(x, gen_tbl = data))

matrix_list
}

Expand All @@ -111,9 +125,36 @@
q_tbl$id <- 1:nrow(q_tbl)
q_tbl <- q_tbl %>% tidyr::pivot_longer(cols = dplyr::starts_with(".Q"),
names_to = "q", values_to = "percentage")
plt <- ggplot2::ggplot(q_tbl,
ggplot2::aes(x = .data$id,
y = .data$percentage,
fill = .data$q)) +
ggplot2::geom_col(width = 1,
position = ggplot2::position_stack(reverse = TRUE))+
ggplot2::labs(y = paste("K = ", K))+
theme_distruct() +
scale_fill_distruct()

Check warning on line 136 in R/as_q_matrix.R

View check run for this annotation

Codecov / codecov/patch

R/as_q_matrix.R#L128-L136

Added lines #L128 - L136 were not covered by tests

plt

Check warning on line 138 in R/as_q_matrix.R

View check run for this annotation

Codecov / codecov/patch

R/as_q_matrix.R#L138

Added line #L138 was not covered by tests

} else {
q_tbl <- tidy(object, data)
}

q_tbl <- q_tbl %>% tidyr::pivot_longer(cols = dplyr::starts_with(".Q"),
names_to = "q", values_to = "percentage")
q_tbl$percentage <- as.numeric(q_tbl$percentage)

dominant_q <- q_tbl %>%
dplyr::group_by(.data$id) %>%
dplyr::summarize(dominant_q = max(.data$percentage), .populations = 'drop')

q_tbl <- q_tbl %>%
dplyr::left_join(dominant_q, by = "id")

q_tbl <- q_tbl %>%
dplyr::arrange(.data$group, dplyr::desc(.data$dominant_q)) %>%
dplyr::mutate(plot_order = dplyr::row_number(), # Create plot_order column
id = factor(.data$id, levels = unique(.data$id[order(.data$group, -.data$dominant_q)])))
plt <- ggplot2::ggplot(q_tbl,
ggplot2::aes(x = .data$id,
y = .data$percentage,
Expand All @@ -131,6 +172,8 @@
}
}
plt
}

}


Expand Down
37 changes: 37 additions & 0 deletions R/gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ gen_tibble.character <-
stop ("'0' can not be a valid allele (it is the default missing allele value!)")
}

backingfile <- filenaming(backingfile)

if ((tolower(file_ext(x))=="bed") || (tolower(file_ext(x))=="rds")){
rlang::check_dots_empty()
x_gt <- gen_tibble_bed_rds(x = x, ...,
Expand Down Expand Up @@ -250,6 +252,8 @@ gen_tibble.matrix <- function(x, indiv_meta, loci, ...,
stop ("there is a mismatch between the number of loci in the genotype table x and in the loci table")
}

backingfile <- filenaming(backingfile)

# use code for NA in FBM.256
# x[is.na(x)]<-3

Expand Down Expand Up @@ -468,6 +472,39 @@ harmonise_missing_values <- function (loci_info, missing_alleles =c("0",".")){
}


# check for existing .bk files
filenaming <- function(file){

bk <- paste0(file, ".bk")
rds <- paste0(file, ".rds")

if(file.exists(bk) & !file.exists(rds)){

version <- 1

base_name <- basename(file)

version_pattern <- paste0(base_name, "_v(\\d+)\\.bk$")

# read existing files to check for existing versions
existing_files <- list.files(dirname(bk), pattern = paste0("^", base_name, "_v\\d+\\.bk$"))


if (length(existing_files) > 0) {
versions <- sub(version_pattern, "\\1", existing_files)
versions <- as.numeric(versions)
if (!any(is.na(versions))) {
version <- max(versions) + 1
}
}

new_file <- paste0(file,"_v",version)

return(new_file)
}

return(file)

}


1 change: 0 additions & 1 deletion R/group_by.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ dplyr_reconstruct.gen_tbl <-function(data, template)
dplyr_reconstruct.grouped_gen_tbl <-function(data, template)
{
out <- NextMethod()
browser()
# if the genotypes are gone, drop the tbl_df class
if (!"genotypes" %in% names(data)){
message("as genotypes were dropped, this is not longer a 'gen_tbl'")
Expand Down
2 changes: 1 addition & 1 deletion R/gt_as_plink.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ gt_as_plink <- function(x, file = NULL, type = c("bed","ped","raw"),
)
} else if (type=="ped"){
all_files <- c(file,
gsub(".ped",".fam",file))
gsub(".ped",".map",file))
} else if (type=="raw"){
all_files <- file
}
Expand Down
3 changes: 2 additions & 1 deletion R/qc_report_loci.R
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,8 @@ autoplot_l_qc_sig_hwe <- function(object,hwe_p=hwe_p,logp,...){

#Hardy weinberg exact test p-val distribution
qc_report$hwe_p_log <- -log10(qc_report$hwe_p)
qc_lowhwe <- subset(qc_report,qc_report$hwe_p < logp)

qc_lowhwe <- qc_report[qc_report$hwe_p < hwe_p, ]

hwe_low <- ggplot2::ggplot(qc_lowhwe,ggplot2::aes(x=.data$hwe_p_log))+ggplot2::geom_histogram(binwidth = 0.5,fill="#66C2A5")+ ggplot2::labs(x=expression("-log"[10]* " of HWE exact p-value"),y="Number of SNPs", title = paste("HWE exact p-value <", hwe_p))+ ggplot2::geom_vline(xintercept= logp, lty=2, col="red")
}
Expand Down
29 changes: 29 additions & 0 deletions man/augment_q_matrix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 1 addition & 4 deletions man/read_q_matrix_list.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

53 changes: 52 additions & 1 deletion tests/testthat/test_gen_tibble.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ test_that("gen_tibble catches invalid alleles",{
test_loci_wrong <- test_loci
test_loci_wrong$allele_alt[1] <- "N"
expect_error(test_dfs_gt <- gen_tibble(test_genotypes, indiv_meta = test_indiv_meta,
loci = test_loci_wrong, quiet = TRUE),"valid alleles are")
loci = test_loci_wrong, quiet = TRUE,"valid alleles are"))
# now add N to the valid alleles
test_dfs_gt <- gen_tibble(test_genotypes, indiv_meta = test_indiv_meta,
loci = test_loci_wrong,
Expand Down Expand Up @@ -417,6 +417,57 @@ test_that("check summary stats are the same for gen_tibbles read in different wa

})


test_indiv_meta <- data.frame (id=c("a","b","c"),
population = c("pop1","pop1","pop2"))
test_genotypes <- rbind(c(1,1,0,1,1,0),
c(2,1,0,0,0,0),
c(2,2,0,0,1,1))
test_loci <- data.frame(name=paste0("rs",1:6),
chromosome=paste0("chr",c(1,1,1,1,2,2)),
position=as.integer(c(3,5,65,343,23,456)),
genetic_dist = as.integer(rep(0,6)),
allele_ref = c("A","T","C","G","C","T"),
allele_alt = c("T","C", NA,"C","G","A"))


test_gt <- gen_tibble(x = test_genotypes, loci = test_loci,
indiv_meta = test_indiv_meta, quiet = TRUE,
backingfile = tempfile())

test_that("versioning if .bk already exists",{

files <- gt_get_file_names(test_gt)

file.remove(files[1])

file <- gsub(".bk","",files[2],)

test_gt <- gen_tibble(x = test_genotypes, loci = test_loci,
indiv_meta = test_indiv_meta, quiet = TRUE,
backingfile = file)

new_files <- gt_get_file_names(test_gt)

expect_equal(new_files[2], paste0(file,"_v1.bk"))

file.remove(new_files[1])

file.exists(new_files[1])
file.exists(new_files[2])

test_gt <- gen_tibble(x = test_genotypes, loci = test_loci,
indiv_meta = test_indiv_meta, quiet = TRUE,
backingfile = file)

new_version_files <- gt_get_file_names(test_gt)

expect_equal(new_version_files[2], paste0(file,"_v2.bk"))

})



# Windows prevents the deletion of the backing file. It's something to do with the memory mapping
# library used by bigsnpr
# test_that("on error, we remove the old files",{
Expand Down
Loading