EvolEcolGroup · dramanica · Sep 25, 2024 · Sep 19, 2024 · Sep 19, 2024 · Sep 19, 2024
diff --git a/NAMESPACE b/NAMESPACE
@@ -6,6 +6,7 @@ S3method("show_loci<-",tbl_df)
 S3method("show_loci<-",vctrs_bigSNP)
 S3method(augment,gt_dapc)
 S3method(augment,gt_pca)
+S3method(augment,q_matrix)
 S3method(augment_loci,gt_pca)
 S3method(autoplot,gt_cluster_pca)
 S3method(autoplot,gt_dapc)

diff --git a/R/as_q_matrix.R b/R/as_q_matrix.R
@@ -19,8 +19,6 @@
 
 
 
-
-
 #' Tidy a Q matrix
 #'
 #' Takes a `q_matrix` object, which is a matrix, and returns a tidied tibble.
@@ -33,44 +31,63 @@
 #' @export
 tidy.q_matrix <- function(x, data, ...){
   rlang::check_dots_empty()
+
   q_tbl <- x %>%
-    tibble::as_tibble() %>%
+    tibble::as_tibble()
+
+  q_tbl <- q_tbl  %>%
     dplyr::mutate(id = data$id,
-                  # @TODO we should get this from the grouped tibble, not hardcode it!
+                  #@TODO
                   group = data$population)
 
-  q_tbl <- q_tbl %>% tidyr::pivot_longer(cols = dplyr::starts_with(".Q"),
-                                         names_to = "q", values_to = "percentage")
-  q_tbl$percentage <- as.numeric(q_tbl$percentage)
-  #q_tbl
-  dominant_q <- q_tbl %>%
-    dplyr::group_by(.data$id) %>%
-    dplyr::summarize(dominant_pop = .data$group[which.max(.data$percentage)], dominant_q = max(.data$percentage), .groups = 'drop')
-
-  q_tbl <- q_tbl %>%
-    dplyr::left_join(dominant_q, by = "id")
-
-  q_tbl <- q_tbl %>%
-    dplyr::arrange(.data$group, dplyr::desc(.data$dominant_q)) %>%
-    dplyr::mutate(plot_order = dplyr::row_number(),  # Create plot_order column
-           id = factor(.data$id, levels = unique(.data$id[order(.data$group, -.data$dominant_q)])))
   q_tbl
 }
 
 
+#' Augment data with information from a q_matrix object
+#'
+#' Augment for `q_matrix` accepts a model object and a dataset and adds
+#' Q values to each observation in the dataset.
+#' Q values  are stored in separate columns, which is given name with the
+#' pattern ".Q1",".Q2", etc. For consistency with [broom::augment.prcomp], a column
+#' ".rownames" is also returned; it is a copy of 'id', but it ensures that
+#' any scripts written for data augmented with [broom::augment.prcomp] will
+#' work out of the box (this is especially helpful when adapting plotting scripts).
+#' @param x  A `q_matrix` object
+#' @param data the `gen_tibble` used to run the clustering algorithm
+#' @param ... Not used. Needed to match generic signature only.
+#' @return A  [gen_tibble] containing the original data along with
+#'   additional columns containing each observation's Q values.
+#' @export
+#' @name augment_q_matrix
+
+augment.q_matrix <- function(x, data = NULL, ...) {
+
+  if (!".rownames" %in% names(data)) {
+    data <- data %>%
+      dplyr::mutate(.rownames = data$id)
+  }
+
+
+  q_tbl <- tidy(x,data)
+
+  data <- dplyr::left_join(data,q_tbl, by = "id")
+
+}
+
+
 #' Tidy ADMXITURE output files into plots
 #'
 #' Takes the name of a directory containing .Q file outputs, and
 #' produces a list of tidied tibbles ready to plot.
 #'
 #' @param x the name of a directory containing .Q files
-#' @param data  An associated tibble (e.g. a [`gen_tibble`]), with the individuals in the same order as the data used to
-#' generate the Q matrix
 #' @returns a list of `q_matrix` objects to plot
 #'
 #' @export
 
-read_q_matrix_list <- function(x, data){
+read_q_matrix_list <- function(x){
+
 
   files <- list.files(x, pattern = "\\.Q$", full.names = TRUE)
 
@@ -83,9 +100,6 @@
   # Sort matrix_list by the number of columns
   matrix_list <- matrix_list[order(sapply(matrix_list, ncol))]
 
-  # Tidy each
-  matrix_list <- lapply(matrix_list, function(x) tidy(x, gen_tbl = data))
-
   matrix_list
 }
 
@@ -111,9 +125,36 @@
     q_tbl$id <- 1:nrow(q_tbl)
     q_tbl <- q_tbl %>% tidyr::pivot_longer(cols = dplyr::starts_with(".Q"),
                                        names_to = "q", values_to = "percentage")
+    plt <- ggplot2::ggplot(q_tbl,
+                           ggplot2::aes(x = .data$id,
+                                        y = .data$percentage,
+                                        fill = .data$q)) +
+      ggplot2::geom_col(width = 1,
+                        position = ggplot2::position_stack(reverse = TRUE))+
+      ggplot2::labs(y = paste("K = ", K))+
+      theme_distruct() +
+      scale_fill_distruct()
+
+    plt
+
   } else {
     q_tbl <- tidy(object, data)
-  }
+
+    q_tbl <- q_tbl %>% tidyr::pivot_longer(cols = dplyr::starts_with(".Q"),
+                                           names_to = "q", values_to = "percentage")
+    q_tbl$percentage <- as.numeric(q_tbl$percentage)
+
+    dominant_q <- q_tbl %>%
+      dplyr::group_by(.data$id) %>%
+      dplyr::summarize(dominant_q = max(.data$percentage), .populations = 'drop')
+
+    q_tbl <- q_tbl %>%
+      dplyr::left_join(dominant_q, by = "id")
+
+    q_tbl <- q_tbl %>%
+      dplyr::arrange(.data$group, dplyr::desc(.data$dominant_q)) %>%
+      dplyr::mutate(plot_order = dplyr::row_number(),  # Create plot_order column
+             id = factor(.data$id, levels = unique(.data$id[order(.data$group, -.data$dominant_q)])))
     plt <- ggplot2::ggplot(q_tbl,
                            ggplot2::aes(x = .data$id,
                                         y = .data$percentage,
@@ -131,6 +172,8 @@
       }
     }
     plt
+  }
+
 }
 
 

diff --git a/R/gen_tibble.R b/R/gen_tibble.R
@@ -90,6 +90,8 @@ gen_tibble.character <-
     stop ("'0' can not be a valid allele (it is the default missing allele value!)")
   }
 
+  backingfile <- filenaming(backingfile)
+
   if ((tolower(file_ext(x))=="bed") || (tolower(file_ext(x))=="rds")){
     rlang::check_dots_empty()
     x_gt <- gen_tibble_bed_rds(x = x, ...,
@@ -250,6 +252,8 @@ gen_tibble.matrix <- function(x, indiv_meta, loci, ...,
     stop ("there is a mismatch between the number of loci in the genotype table x and in the loci table")
   }
 
+  backingfile <- filenaming(backingfile)
+
   # use code for NA in FBM.256
 #  x[is.na(x)]<-3
 
@@ -468,6 +472,39 @@ harmonise_missing_values <- function (loci_info, missing_alleles =c("0",".")){
 }
 
 
+# check for existing .bk files
+filenaming <- function(file){
+
+  bk <- paste0(file, ".bk")
+  rds <- paste0(file, ".rds")
+
+  if(file.exists(bk) & !file.exists(rds)){
+
+    version <- 1
+
+    base_name <- basename(file)
+
+    version_pattern <- paste0(base_name, "_v(\\d+)\\.bk$")
+
+    # read existing files to check for existing versions
+  existing_files <- list.files(dirname(bk), pattern = paste0("^", base_name, "_v\\d+\\.bk$"))
+
 
+    if (length(existing_files) > 0) {
+      versions <- sub(version_pattern, "\\1", existing_files)
+      versions <- as.numeric(versions)
+      if (!any(is.na(versions))) {
+        version <- max(versions) + 1
+      }
+    }
+
+    new_file <- paste0(file,"_v",version)
+
+    return(new_file)
+  }
+
+  return(file)
+
+}
 
 
diff --git a/R/group_by.R b/R/group_by.R
@@ -33,7 +33,6 @@ dplyr_reconstruct.gen_tbl <-function(data, template)
 dplyr_reconstruct.grouped_gen_tbl <-function(data, template)
 {
   out <- NextMethod()
-  browser()
   # if the genotypes are gone, drop the tbl_df class
   if (!"genotypes" %in% names(data)){
     message("as genotypes were dropped, this is not longer a 'gen_tbl'")

diff --git a/R/gt_as_plink.R b/R/gt_as_plink.R
@@ -31,7 +31,7 @@ gt_as_plink <- function(x, file = NULL, type = c("bed","ped","raw"),
                    )
   } else if (type=="ped"){
     all_files <- c(file,
-                   gsub(".ped",".fam",file))
+                   gsub(".ped",".map",file))
   } else if (type=="raw"){
     all_files <- file
   }

diff --git a/R/qc_report_loci.R b/R/qc_report_loci.R
@@ -196,7 +196,8 @@ autoplot_l_qc_sig_hwe <- function(object,hwe_p=hwe_p,logp,...){
 
   #Hardy weinberg exact test p-val distribution
   qc_report$hwe_p_log <- -log10(qc_report$hwe_p)
-  qc_lowhwe <- subset(qc_report,qc_report$hwe_p < logp)
+
+  qc_lowhwe <- qc_report[qc_report$hwe_p < hwe_p, ]
 
   hwe_low <- ggplot2::ggplot(qc_lowhwe,ggplot2::aes(x=.data$hwe_p_log))+ggplot2::geom_histogram(binwidth = 0.5,fill="#66C2A5")+ ggplot2::labs(x=expression("-log"[10]* " of HWE exact p-value"),y="Number of SNPs", title = paste("HWE exact p-value <", hwe_p))+ ggplot2::geom_vline(xintercept= logp, lty=2, col="red")
 }

diff --git a/man/augment_q_matrix.Rd b/man/augment_q_matrix.Rd
diff --git a/man/read_q_matrix_list.Rd b/man/read_q_matrix_list.Rd
diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R
@@ -49,7 +49,7 @@ test_that("gen_tibble catches invalid alleles",{
   test_loci_wrong <- test_loci
   test_loci_wrong$allele_alt[1] <- "N"
   expect_error(test_dfs_gt <- gen_tibble(test_genotypes, indiv_meta = test_indiv_meta,
-                                           loci = test_loci_wrong, quiet = TRUE),"valid alleles are")
+                                           loci = test_loci_wrong, quiet = TRUE,"valid alleles are"))
   # now add N to the valid alleles
   test_dfs_gt <- gen_tibble(test_genotypes, indiv_meta = test_indiv_meta,
                                          loci = test_loci_wrong,
@@ -417,6 +417,57 @@ test_that("check summary stats are the same for gen_tibbles read in different wa
 
 })
 
+
+test_indiv_meta <- data.frame (id=c("a","b","c"),
+                               population = c("pop1","pop1","pop2"))
+test_genotypes <- rbind(c(1,1,0,1,1,0),
+                        c(2,1,0,0,0,0),
+                        c(2,2,0,0,1,1))
+test_loci <- data.frame(name=paste0("rs",1:6),
+                        chromosome=paste0("chr",c(1,1,1,1,2,2)),
+                        position=as.integer(c(3,5,65,343,23,456)),
+                        genetic_dist = as.integer(rep(0,6)),
+                        allele_ref = c("A","T","C","G","C","T"),
+                        allele_alt = c("T","C", NA,"C","G","A"))
+
+
+test_gt <- gen_tibble(x = test_genotypes, loci = test_loci,
+                      indiv_meta = test_indiv_meta, quiet = TRUE,
+                      backingfile = tempfile())
+
+test_that("versioning if .bk already exists",{
+
+  files <-  gt_get_file_names(test_gt)
+
+  file.remove(files[1])
+
+  file <- gsub(".bk","",files[2],)
+
+  test_gt <- gen_tibble(x = test_genotypes, loci = test_loci,
+                        indiv_meta = test_indiv_meta, quiet = TRUE,
+                        backingfile = file)
+
+  new_files <- gt_get_file_names(test_gt)
+
+  expect_equal(new_files[2], paste0(file,"_v1.bk"))
+
+  file.remove(new_files[1])
+
+  file.exists(new_files[1])
+  file.exists(new_files[2])
+
+  test_gt <- gen_tibble(x = test_genotypes, loci = test_loci,
+                        indiv_meta = test_indiv_meta, quiet = TRUE,
+                        backingfile = file)
+
+  new_version_files <- gt_get_file_names(test_gt)
+
+  expect_equal(new_version_files[2], paste0(file,"_v2.bk"))
+
+})
+
+
+
 # Windows prevents the deletion of the backing file. It's something to do with the memory mapping
 # library used by bigsnpr
 # test_that("on error, we remove the old files",{