EvolEcolGroup · dramanica · Jun 5, 2024 · May 21, 2024 · May 21, 2024 · May 22, 2024
diff --git a/R/gen_tibble.R b/R/gen_tibble.R
@@ -2,6 +2,10 @@
 #'
 #' A `gen_tibble` stores genotypes for individuals in a tidy format. DESCRIBE
 #' here the format
+#'
+#' When loading packedancestry files, missing alleles will be converted from
+#' 'X' to NA
+#'
 #' @param x can be:
 #' - a string giving the path to a PLINK BED or PED file. The associated
 #' BIM and FAM files for the BED, or MAP for PED are expected to be in the same
@@ -136,6 +140,7 @@ gen_tibble_bed_rds <- function(x, ...,
     ploidy <- bigsnp_obj$fam$ploidy
   } else {
     ploidy <- 2
+    bigsnp_obj$fam$ploidy <- 2
   }
 
   indiv_meta$genotypes <- new_vctrs_bigsnp(bigsnp_obj,
@@ -398,8 +403,11 @@ summary.vctrs_bigSNP <- function(object, ...){
 #' @keywords internal
 
 stopifnot_gen_tibble <- function(.x){
-  if ("gentoypes" %in% names(.x)){
-    stopifnot(.x$genotypes)
+  if (!"genotypes" %in% names(.x)){
+    stop("not a gen_tibble, 'genotype' column is missing")
+  }
+  if (!inherits(.x$genotypes,"vctrs_bigSNP")){
+    stop("not a gen_tibble, the genotype column is not of class vctrs_bigSNP")
   }
 }
 

diff --git a/R/gen_tibble_packedancestry.R b/R/gen_tibble_packedancestry.R
@@ -27,10 +27,22 @@ gen_tibble_packedancestry <- function(x, ...,
   #TODO check that allele_ref and allele_alt are not swapped
   names(res$snp)<-c("name", "chromosome",'genetic_dist','position', 'allele_ref','allele_alt')
 
+  #browser()
+
+  allele_alt <- res$snp$allele_ref
+  allele_ref <- res$snp$allele_alt
+
+  loci <- cbind(res$snp[,c(1:4)], allele_ref, allele_alt)
+
+  xs <- which(loci$allele_ref == "X")
+
+  loci[xs, "allele_ref"] <- NA
+
   # using the gen_tibble.matrix method
   new_gen_tbl <- gen_tibble(x = res$geno,
                             indiv_meta = res$ind,
-                            loci = res$snp,
+                            #loci = res$snp,
+                            loci = loci,
                             backingfile = backingfile,
                             quiet=quiet,
                             valid_alleles = valid_alleles,

diff --git a/R/gt_cluster_pca_best_k.R b/R/gt_cluster_pca_best_k.R
@@ -27,7 +27,7 @@
 #' and validate the programmatic selection. The criteria available in this
 #' function are:
 #' - "diffNgroup": differences between successive values of the summary
-#'  statistics (by default, BIC) are splitted into two groups using a Ward's
+#'  statistics (by default, BIC) are split into two groups using a Ward's
 #'  clustering method (see ?hclust), to differentiate sharp decrease from mild
 #'  decreases or increases. The retained K is the one before the first group
 #'  switch. This criterion appears to work well for island/hierarchical models, and decently

diff --git a/R/gt_dapc.R b/R/gt_dapc.R
@@ -1,9 +1,9 @@
 #' Discriminant Analysis of Principal Components for gen_tibble
 #'
 #' This function implements the Discriminant Analysis of Principal Components
-#' (DAPC, Jombart et al. 2010). This method descibes the diversity between
+#' (DAPC, Jombart et al. 2010). This method describes the diversity between
 #' pre-defined groups. When groups are unknown, use [gt_cluster_pca()] to
-#' infer genetic clusters. See 'details' section for a succint
+#' infer genetic clusters. See 'details' section for a succinct
 #' description of the method, and the vignette in the package `adegenet`
 #' ("adegenet-dapc") for a
 #' tutorial. This function returns objects of class [`adegenet::dapc`] which are compatible

diff --git a/R/gt_extract_f2.R b/R/gt_extract_f2.R
@@ -39,7 +39,7 @@
 #' precomputing f2 from a [`gen_tibble`]).
 #' @param outpop Keep only SNPs which are heterozygous in this population
 #' @param outpop_scale Scale f2-statistics by the inverse `outpop`
-#' heteroygosity (`1/(p*(1-p))`). Providing `outpop` and setting
+#' heterozygosity (`1/(p*(1-p))`). Providing `outpop` and setting
 #' `outpop_scale` to `TRUE` will give the same results as the original
 #' *qpGraph* when the `outpop` parameter has been set, but it has the
 #' disadvantage of treating one population different from the others. This

diff --git a/R/gt_has_imputed.R b/R/gt_has_imputed.R
@@ -42,7 +42,7 @@ gt_uses_imputed <- function (x){
 #'
 #' This function sets or unsets the use of imputed data. For some analysis,
 #' such as PCA, that does not allow for missing data, we have to use imputation,
-#' but for other analysis it might be preferabble to allow for missing data.
+#' but for other analysis it might be preferable to allow for missing data.
 #'
 #' @param x a `gen_tibble`
 #' @param set a boolean defining whether imputed data should be used

diff --git a/R/gt_impute_simple.R b/R/gt_impute_simple.R
@@ -9,14 +9,13 @@
 #' @param method one of
 #' - 'mode': the most frequent genotype
 #' - 'mean0': the mean rounded to the nearest integer
-#' - 'mean2': the mean rounded to 2 decimal places
 #' - 'random': randomly sample a genotype based on the observed allele frequencies
 #' @param n_cores the number of cores to be used
 #' @returns a [gen_tibble] with imputed genotypes
 #' @export
 
 gt_impute_simple <-function(x,
-                            method = c("mode", "mean0", "mean2", "random"),
+                            method = c("mode", "mean0", "random"),
                             n_cores = 1) {
   if (!all.equal(attr(x$genotypes,"bigsnp")$genotypes$code256,bigsnpr::CODE_012)){
     if (all.equal(attr(x$genotypes,"bigsnp")$genotypes$code256, bigsnpr::CODE_IMPUTE_PRED) |

diff --git a/R/gt_pca_autoSVD.R b/R/gt_pca_autoSVD.R
@@ -72,16 +72,32 @@ gt_pca_autoSVD <- function(x, k = 10,
   X <- attr(x$genotypes,"bigsnp") # convenient pointer
   x_ind_col <- show_loci(x)$big_index
   x_ind_row <- vctrs::vec_data(x$genotypes)
+  # we need to create a chromosome vector that is as long as the complete bigsnp object
+  infos_chr<-rep(1,nrow(.gt_get_bigsnp(x)$map))
   if (is.character(show_loci(x)$chromosome)){
-    infos_chr <- as.numeric(factor(show_loci(x)$chromosome))
+
+    infos_chr[.gt_bigsnp_cols(x)] <- as.numeric(factor(show_loci(x)$chromosome))
+
   } else {
-    infos_chr <- show_loci(x)$chromosome
+    infos_chr[.gt_bigsnp_cols(x)] <- show_loci(x)$chromosome
+  }
+  # chromosomes have to be positive numbers
+  if (min(infos_chr)<1){
+    infos_chr <- infos_chr+abs(min(infos_chr)+1)
+  }
+  infos_pos <- NULL
+  if (use_positions){
+    infos_pos <- rep(0,nrow(.gt_get_bigsnp(x)$map))
+    infos_pos[.gt_bigsnp_cols(x)] <- show_loci(x)$position
   }
+  # hack to get around the use of a dataset by bigsnpr
+  # TODO remove after bignspr is patched
+  attachNamespace("bigsnpr")
   this_svd  <- bigsnpr::snp_autoSVD(X$genotypes,
                       infos.chr = infos_chr,
-                      infos.pos = if (use_positions) {show_loci(x)$position} else {NULL},
-                      ind.row = vctrs::vec_data(x$genotypes),
-                      ind.col = show_loci(x)$big_index,
+                      infos.pos = infos_pos,
+                      ind.row = .gt_bigsnp_rows(x),
+                      ind.col = .gt_bigsnp_cols(x),
                       fun.scaling = fun_scaling,
                       thr.r2 = thr_r2,
                       size = size,

diff --git a/R/gt_pca_tidiers.R b/R/gt_pca_tidiers.R
@@ -152,7 +152,7 @@ augment.gt_pca <- function(x, data = NULL, k= NULL, ...) {
 #' Augment for `gt_pca` accepts a model object and a `gen_tibble` and adds
 #' loadings for each locus to the loci table. Loadings for each component are stored in a
 #' separate column, which is given name with the pattern ".loadingPC1",
-#' ".loadingPC2", etc. If `data` is missing, then a tibble with the lodings is returned.
+#' ".loadingPC2", etc. If `data` is missing, then a tibble with the loadings is returned.
 #' @param x  A `gt_pca` object returned by one of the `gt_pca_*` functions.
 #' @param data the `gen_tibble` used to run the PCA.
 #' @param k the number of components to add

diff --git a/R/indiv_missingness.R b/R/indiv_missingness.R
@@ -1,12 +1,12 @@
 #' Estimate individual missingness
 #'
-#' Estimate missingnes for each individual (i.e. the frequency of
+#' Estimate missingness for each individual (i.e. the frequency of
 #' missing genotypes in an individual).
 #'
 #' @param .x a vector of class `vctrs_bigSNP` (usually the `genotype` column of
 #' a [`gen_tibble`] object),
 #' or a [`gen_tibble`].
-#' @param as_counts booelean defining whether the count of NAs (rather than the rate)
+#' @param as_counts boolean defining whether the count of NAs (rather than the rate)
 #' should be returned. It defaults to FALSE (i.e. rates are returned by default).
 #' @param ... currently unused.
 #' @returns a vector of heterozygosities, one per individuals in the [`gen_tibble`]

diff --git a/R/pairwise_pop_fst.R b/R/pairwise_pop_fst.R
@@ -53,8 +53,9 @@ pairwise_pop_fst <- function(.x, by_locus=FALSE,
 
 pairwise_pop_fst_hudson <- function(.x, by_locus=FALSE, n_cores = bigstatsr::nb_cores()){
 
-  message("this function is not properly tested yet!!!")
-  message("if you have time, test the results against something like hierfstat and create a unit test")
+  #message("this function is not properly tested yet!!!")
+  #message("if you have time, test the results against something like hierfstat and create a unit test")
+  message("Whilst this function should work, it has not been extensively tested. Check your results to ensure they make sense")
   # get the populations
   .group_levels = .x %>% group_keys()
   # create all combinations
@@ -107,8 +108,9 @@ pairwise_pop_fst_hudson <- function(.x, by_locus=FALSE, n_cores = bigstatsr::nb_
 # based on the formula in Bhatia 2013
 pairwise_pop_fst_wc84 <- function(.x, by_locus=FALSE, n_cores = bigstatsr::nb_cores()){
 
-  message("this function is not properly tested yet!!!")
-  message("if you have time, test the results against something like hierfstat and create a unit test")
+  #message("this function is not properly tested yet!!!")
+  #message("if you have time, test the results against something like hierfstat and create a unit test")
+  message("Whilst this function should work, it has not been extensively tested. Check your results to ensure they make sense")
   # get the populations
   .group_levels = .x %>% group_keys()
   # create all combinations
@@ -162,8 +164,9 @@ pairwise_pop_fst_wc84 <- function(.x, by_locus=FALSE, n_cores = bigstatsr::nb_co
 # based on the formula in Bhatia 2013
 pairwise_pop_fst_nei86 <- function(.x, by_locus=FALSE, n_cores = bigstatsr::nb_cores()){
 
-  message("this function is not properly tested yet!!!")
-  message("if you have time, test the results against something like hierfstat and create a unit test")
+  #message("this function is not properly tested yet!!!")
+  #message("if you have time, test the results against something like hierfstat and create a unit test")
+  message("Whilst this function should work, it has not been extensively tested. Check your results to ensure they make sense")
   # get the populations
   .group_levels = .x %>% group_keys()
   # create all combinations

diff --git a/R/qc_report_indiv.R b/R/qc_report_indiv.R
@@ -101,8 +101,10 @@ autoplot_qc_report_indiv_king <- function(object, kings_threshold = kings_thresh
   king$row <- colnames(king)
 
   #format into 3 columns: ID1, ID2, and their relatedness coefficient
-  # TODO gather is superseded, and should be rewritten with pivot_longer
-  king <- tidyr::gather(king, "column", "value", -"row")
+  king <- tidyr::pivot_longer(king, cols = !row, names_to = "Column",
+                              values_to = "Value")
+  king <- dplyr::filter(king, king$row < king$Column)
+
   colnames(king) <- c("ID1","ID2","kinship")
 
   #remove duplication from the new df

diff --git a/R/rbind_gen_tibble.R b/R/rbind_gen_tibble.R
@@ -119,15 +119,22 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE,
   t_ref_fbm$ncol<-t_ref_fbm$ncol+t_target_fbm$ncol
   # now flip the file around
   merged_fbm <- bigstatsr::big_transpose(t_ref_fbm, backingfile = backingfile) # TODO this should be written in the director of interest
+  # Make sure that the two fam tibbles have the same columns
+  ref_snp$fam <- add_missing_cols(ref_snp$fam, target_snp$fam)
+  target_snp$fam <- add_missing_cols(target_snp$fam, ref_snp$fam)
+  # now create a bigsnp object
   merged_snp <- structure(list(genotypes = merged_fbm,
                              fam = rbind(ref_snp$fam, target_snp$fam),
                              map = ref_snp$map),
                         class = "bigSNP")
   merged_rds <- paste0(backingfile,".rds")
   saveRDS(merged_snp, merged_rds)
   # Now we need to create the gen_tibble
-  # TODO we need to turn genotype into a string!!!
-  merged_tbl <- rbind(ref %>% select(-dplyr::any_of("genotypes")), target %>% select(-dplyr::any_of("genotypes")))
+  # Make sure that the two tibbles have the same columns
+  ref <- add_missing_cols(ref, target)
+  target <- add_missing_cols(target, ref)
+  merged_tbl <- rbind(ref %>% select(-dplyr::any_of("genotypes")),
+                      target %>% select(-dplyr::any_of("genotypes")))
   # make sure that the genotypes vector points to the correct rows
   vctrs::vec_data(ref$genotypes)
   #and finally append the loci table
@@ -138,8 +145,10 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE,
   merged_tbl$genotypes <- vctrs::new_vctr(match(indivs_with_big_names,merged_snp$fam$sample.ID), # TODO check that this is the correct order!!!!
                   bigsnp = merged_snp,
                   bigsnp_file = merged_rds,
+                  bigsnp_md5sum = tools::md5sum(merged_rds),
                   loci=new_ref_loci_tbl,
                   names=indivs_with_big_names,
+                  ploidy = 2, # TODO hardcoded as we currently only work for diploid
                   class = "vctrs_bigSNP")
 
   # TODO check that the snp is saved (it should be), and let the user know what the new
@@ -154,3 +163,13 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE,
   )
 }
 
+
+add_missing_cols <- function (x, y){
+  missing_cols <- names(y)[!names(y) %in% names(x)]
+  if (length(missing_cols)>0){
+    for (col_i in missing_cols){
+      x[col_i]<-NA
+    }
+  }
+  x
+}
diff --git a/R/snp_king.R b/R/snp_king.R
@@ -2,7 +2,6 @@
 #'
 #' This function computes the KING-robust estimator of kinship.
 #'
-#' The results should be equivalent to PLINK, but that SHOULD be tested!!!!
 #' The last step is not optimised yet, as it does the division of the num by the
 #' den all in memory (on my TODO list...).
 #'

diff --git a/R/vcf_to_fbm.R b/R/vcf_to_fbm.R
@@ -114,12 +114,13 @@ vcf_to_fbm <- function(
 
     # create loci table
     loci <- rbind(loci, tibble(
-                   chromosome = vcfR::getCHROM(temp_vcf)[bi],
-                   marker.ID = vcfR::getID(temp_vcf)[bi],
+                   chromosome = unname(vcfR::getCHROM(temp_vcf)[bi]),
+                   marker.ID = unname(vcfR::getID(temp_vcf)[bi]), # remove names as it does have ID as a name
                    genetic.dist = 0,
                    physical.pos = vcfR::getPOS(temp_vcf)[bi],
-                   allele1 = vcfR::getALT(temp_vcf)[bi],
-                   allele2 = vcfR::getREF(temp_vcf)[bi]))
+                   allele1 = unname(vcfR::getALT(temp_vcf)[bi]),
+                   allele2 = unname(vcfR::getREF(temp_vcf)[bi])))
+
   }
   # save it
   file_backed_matrix$save()

diff --git a/README.md b/README.md
@@ -8,6 +8,9 @@ The goal of `tidypopgen` is to provide a tidy grammar of population genetics, fa
 the manipulation and analysis of genetic data. Currently, it is focussed on biallelic single nucleotide
 polymorphisms (SNPs).
 
+We are making available a *preview* version of the package. Everything should work,
+but be vigilant as not everything has been tested extensively.
+
 ## Installation
 
 You need `devtools` to install `tidypopgen`. If you haven't done so already, install it with:
@@ -24,7 +27,7 @@ devtools::install_github("EvolEcolGroup/tidypopgen")
 
 There are a several vignettes designed to teach you how to use `tidypopgen`. 
 The
-'overview' vignette explains how the data structures are designes, and provides a illustration
+'overview' vignette explains how the data structures are designed, and provides a illustration
 of the grammar used to manipulate individuals and loci. 
 
 The 'example workflow' vignette provides a fully annotated example of how to 
@@ -36,3 +39,14 @@ running a fully QC of a dataset before analysis.
 Finally, we provide a 'PLINK cheatsheet' aimed at translating common tasks
 performed in PLINK into `tidypopgen` commands.
 
+## When something does not work
+
+This is a preview version of `tidypopgen`. Everything should work, but not all
+elements have been extensively tested. If something does not work, check the [issues on
+GitHub](https://github.com/EvolEcolGroup/pastclim/issues) to see whether
+the problem has already been reported. If not, feel free to create an
+new issue. Please make sure you have updated to the latest version of
+`pastclim` on CRAN, as well as updating all other packages on your
+system, and provide [a reproducible
+example](https://reprex.tidyverse.org/)
+for the developers to investigate the problem.
diff --git a/data-raw/adegenet_functions/converted/example_clustering_and_dapc.Rmd b/data-raw/adegenet_functions/converted/example_clustering_and_dapc.Rmd
@@ -55,7 +55,7 @@ The ids from the VCF are in a different format than the ones we just got from
 the pop csv. We need a bit of string wrangling, but it looks easy, we just need
 to remove "punc_":
 
-Let us simplify the ids, which wll have a "punc_" prefix
+Let us simplify the ids, which will have a "punc_" prefix
 ```{r}
 anole_gt <- anole_gt %>% mutate(id = gsub('punc_',"",.data$id,))
 anole_gt

diff --git a/data-raw/old_functions_to_scavenge/gt_dapc.R b/data-raw/old_functions_to_scavenge/gt_dapc.R
@@ -1,9 +1,9 @@
 #' Discriminant Analysis of Principal Components for gen_tibble
 #'
 #' This function implements the Discriminant Analysis of Principal Components
-#' (DAPC, Jombart et al. 2010). This method descibes the diversity between
+#' (DAPC, Jombart et al. 2010). This method describes the diversity between
 #' pre-defined groups. When groups are unknown, use [gt_cluster_pca()] to
-#' infer genetic clusters. See 'details' section for a succint
+#' infer genetic clusters. See 'details' section for a succinct
 #' description of the method, and the vignette in the package `adegenet`
 #' ("adegenet-dapc") for a
 #' tutorial. This function returns objects of class [`adegenet::dapc`] which are compatible

diff --git a/data-raw/old_functions_to_scavenge/gt_pca_best_k.R b/data-raw/old_functions_to_scavenge/gt_pca_best_k.R
@@ -27,11 +27,11 @@
 #' and validate the programmatic selection. The criteria available in this
 #' function are:
 #' - "diffNgroup": differences between successive values of the summary
-#'  statistics (by default, BIC) are splitted into two groups using a Ward's
+#'  statistics (by default, BIC) are split into two groups using a Ward's
 #'  clustering method (see ?hclust), to differentiate sharp decrease from mild
 #'  decreases or increases. The retained K is the one before the first group
 #'  switch. This criterion appears to work well for island/hierarchical models, and decently
-#'  for isolation by distance models, albeit with some unstability. It can be
+#'  for isolation by distance models, albeit with some instability. It can be
 #'  confounded by an initial, very sharp decrease of the test statistics. IF
 #'  UNSURE ABOUT THE CRITERION TO USE, USE THIS ONE.
 #' - "min": the model with the minimum summary statistics (as specified by

diff --git a/inst/extdata/pop_a.geno b/inst/extdata/pop_a.geno
diff --git a/inst/extdata/pop_a.ind b/inst/extdata/pop_a.ind
@@ -0,0 +1,5 @@
+   pop_a:GRC14300079 M        ???
+   pop_a:GRC14300142 M        ???
+   pop_a:GRC14300159 M        ???
+   pop_a:GRC14300286 M        ???
+   pop_a:GRC14300305 M        ???
diff --git a/inst/extdata/pop_a.snp b/inst/extdata/pop_a.snp
@@ -0,0 +1,16 @@
+           rs4477212     1        0.000822           82154 A X
+           rs3094315     1        0.007526          752566 A G
+           rs3131972     1        0.007527          752721 G A
+          rs12124819     1        0.007765          776546 A X
+          rs11240777     1        0.007990          798959 G A
+           rs1110052     1        0.008736          873558 G T
+           rs9697457     1        0.009343          934345 G X
+           rs6657048     1        0.009576          957640 C X
+           rs2488991     1        0.009944          994391 T X
+            rs307354     1        0.012645         1264539 C G
+           rs1240719     1        0.013469         1346905 T A
+           rs2862633     2        0.619744        61974443 G X
+          rs28569024     2        1.390088       139008811 T X
+          rs10106770     2        2.358328       235832763 G A
+          rs11942835     3        1.559137       155913651 T C
+           rs5945676    23        0.514331        51433071 T X
diff --git a/inst/extdata/pop_b.geno b/inst/extdata/pop_b.geno
diff --git a/inst/extdata/pop_b.ind b/inst/extdata/pop_b.ind
@@ -0,0 +1,3 @@
+         pop_b:SA073 F        ???
+        pop_b:SA1021 F        ???
+        pop_b:SA1008 M        ???