Merge branch 'main' into memory_use

EvolEcolGroup · Dec 2, 2024 · 3ec8a8c · 3ec8a8c
2 parents 19d64c4 + a96ff58
commit 3ec8a8c
Show file tree

Hide file tree

Showing 37 changed files with 436 additions and 877 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: tidypopgen
 Title: Tidy Population Genetics
-Version: 0.0.0.9019
+Version: 0.0.0.9020
 Authors@R: 
     c(person("Evie", "Carter", role = c("aut")),
     person("Andrea", "Manica", email = "[email protected]", 

diff --git a/R/filter_high_relatedness.R b/R/filter_high_relatedness.R
@@ -72,7 +72,6 @@ filter_high_relatedness <-
     # for pairs with higher than kings_threshold relatedness
     # the individual with higher average relatedness is removed
     for (i in 1:(var_num - 1)) {
-      #browser()
       if (!any(matrix2[!is.na(matrix2)] > kings_threshold)) {
         if (verbose) {
           message("All correlations <=", kings_threshold, "\n")

diff --git a/R/gen_tibble.R b/R/gen_tibble.R
@@ -3,11 +3,15 @@
 #' A `gen_tibble` stores genotypes for individuals in a tidy format. DESCRIBE
 #' here the format
 #'
-#' - *VCF* files: the fast `cpp` parser is used by default. It attempts to establish
-#' ploidy from the first variant; if that variant is found in a sex chromosome (or mtDNA),
-#' it will fail. To use the fast parser, change the order of variants so that the first chromosome is an
-#' autosome using a tool such as `vcftools`. Alternatively, the slower but more robust
-#' `vcfR` parser should be used instead.
+#'- *VCF* files: the fast `cpp` parser is used by default. Both `cpp` and `vcfR` parsers
+#' attempt to establish ploidy from the first variant; if that variant is found in a
+#' sex chromosome (or mtDNA), the parser will fail with 'Error: a genotype has more
+#' than max_ploidy alleles...'. To successful import such a *VCF*, change the order of variants
+#' so that the first chromosome is an autosome using a tool such as `vcftools`.
+#' Currently, only biallelic SNPs are supported. If haploid variants (e.g. sex
+#' chromosomes) are included in the *VCF*, they are not transformed into homozygous
+#' calls. Instead, reference alleles will be counted as 0 and alternative alleles
+#' will be counted as 1.
 #'
 #' - *packedancestry* files: When loading *packedancestry* files, missing alleles will be converted from
 #' 'X' to NA
@@ -19,9 +23,9 @@
 #' - a string giving the path to a RDS file storing a `bigSNP` object from
 #' the `bigsnpr` package (usually created with [bigsnpr::snp_readBed()])
 #' - a string giving the path to a vcf file. Note that we currently read the whole
-#' vcf in memory with `vcfR`, so only smallish vcf can be imported. Only biallelic
+#' vcf in memory with `vcfR`, so only smallish *VCF* can be imported. Only biallelic
 #' SNPs will be considered.
-#' - a string giving the path to a packedancestry .geno file. The associated
+#' - a string giving the path to a *packedancestry* .geno file. The associated
 #' .ind and .snp files are expected to be in the same directory and share the
 #' same file name prefix.
 #' - a genotype matrix of dosages (0, 1, 2, NA) giving the dosage of the alternate
@@ -39,9 +43,9 @@
 #' if `x` is a vcf or packedancestry file)
 #' @param ... if `x` is the name of a vcf file, additional arguments
 #' passed to [vcfR::read.vcfR()]. Otherwise, unused.
-#' @param parser the name of the parser used for VCF, either "cpp" to use
+#' @param parser the name of the parser used for *VCF*, either "cpp" to use
 #' a fast C++ parser, or "vcfR" to use the R package `vcfR`. The latter is slower
-#' but more robust; if "cpp" gives error, try using "vcfR" in case your VCF has
+#' but more robust; if "cpp" gives error, try using "vcfR" in case your *VCF* has
 #' an unusual structure.
 #' @param n_cores the number of cores to use for parallel processing
 #' @param valid_alleles a vector of valid allele values; it defaults to 'A','T',
@@ -52,10 +56,10 @@
 #' @param backingfile the path, including the file name without extension,
 #' for backing files used to store the data (they will be given a .bk
 #' and .RDS automatically). This is not needed if `x` is already an .RDS file.
-#' If `x` is a .BED file and `backingfile` is left NULL, the backing file will
+#' If `x` is a .BED or a *VCF* file and `backingfile` is left NULL, the backing file will
 #' be saved in the same directory as the
 #' bed file, using the same file name but with a different file type (.bk rather
-#' than .bed). The same logic applies to .vcf files. If `x` is a genotype matrix and `backingfile` is NULL, then a
+#' than .bed). If `x` is a genotype matrix and `backingfile` is NULL, then a
 #' temporary file will be created (but note that R will delete it at the end of
 #' the session!)
 #' @param quiet provide information on the files used to store the data
@@ -135,9 +139,8 @@ gen_tibble.character <-
     stop("file_path should be pointing to a either a PLINK .bed or .ped file, a bigSNP .rds file or a VCF .vcf or .vcf.gz file")
   }
 
-  loci <- show_loci(x_gt)
-  new_loci <- add_chromosome_as_int(loci)
-  show_loci(x_gt) <- new_loci
+  # create a chr_int column
+  show_loci(x_gt)$chr_int <- cast_chromosome_to_int(show_loci(x_gt)$chromosome)
 
   file_in_use <- gt_save_light(x_gt, quiet = quiet)
   return(x_gt)
@@ -311,9 +314,8 @@ gen_tibble.matrix <- function(x, indiv_meta, loci, ...,
                         remove_on_fail = TRUE)
   show_loci(new_gen_tbl) <- harmonise_missing_values(show_loci(new_gen_tbl), missing_alleles = missing_alleles)
 
-  loci <- show_loci(new_gen_tbl)
-  new_loci <- add_chromosome_as_int(loci)
-  show_loci(new_gen_tbl) <- new_loci
+  # create a chr_int column
+  show_loci(new_gen_tbl)$chr_int <- cast_chromosome_to_int(show_loci(new_gen_tbl)$chromosome)
 
   files_in_use <- gt_save(new_gen_tbl, quiet = quiet)
   return(new_gen_tbl)
@@ -570,39 +572,23 @@ change_duplicated_file_name <- function(file){
 }
 
 # adding a chr_int column
-add_chromosome_as_int <- function(loci){
-
-
-  if(is.integer(loci$chromosome)){
-    loci$chr_int <- loci$chromosome
-
-  } else if(is.numeric(loci$chromosome)){
-    non_na_id <- !is.na(loci$chromosome)
-    chr_int <- as.integer(loci$chromosome[non_na_id])
-
-    loci$chr_int <- rep(NA_integer_, length(loci$chromosome))
-    loci$chr_int[non_na_id] <- chr_int
-
-  } else if(is.character(loci$chromosome)){
-    non_na_id <- !is.na(loci$chromosome)
-    chr_factor <- as.factor(loci$chromosome[non_na_id])
-    chr_int <- as.integer(chr_factor)
-
-    loci$chr_int <- rep(NA_integer_, length(loci$chromosome))
-    loci$chr_int[non_na_id] <- chr_int
-
-
-  } else if(is.factor(loci$chromosome)){
-    non_na_id <- !is.na(loci$chromosome)
-    chr_int <- as.integer(loci$chromosome[non_na_id])
-
-    loci$chr_int <- rep(NA_integer_, length(loci$chromosome))
-    loci$chr_int[non_na_id] <- chr_int
-
+cast_chromosome_to_int <- function(chromosome){
+  # if chromosome is an factor, then cast it to character
+  if (is.factor(chromosome)){
+    chromosome <- as.character(chromosome)
+  }
+  # if chromosome is a character, then cast it to integer
+  if (is.character(chromosome)){
+    chromosome <- tryCatch(as.integer(chromosome), warning = function(e) {as.integer(as.factor(chromosome))})
+  }
+  if (is.numeric(chromosome)){
+    chromosome <- as.integer(chromosome)
+  }
+  if (is.integer(chromosome)){
+    return(chromosome)
   } else {
-
     stop("Chromosome column should be integer, character, or factor")
   }
-  return(loci)
+
 }
 
diff --git a/R/gen_tibble_packedancestry.R b/R/gen_tibble_packedancestry.R
@@ -55,7 +55,6 @@ gen_tibble_packedancestry <- function(x, ...,
                                               inds = indiv_table$id[i],
                                             ..., verbose = !quiet)
     res$geno[is.na(res$geno)]<-3
-    #browser()
     # now insert the genotypes in the FBM
     file_backed_matrix[
       i,

diff --git a/R/gt_pca.R b/R/gt_pca.R
@@ -14,5 +14,9 @@
 #' NOTE: using gt_pca_autoSVD with a small dataset will likely cause an error,
 #' see man page for details.
 #'
+#' NOTE: monomorphic markers must be removed before PCA is computed. The error
+#' message 'Error: some variables have zero scaling; remove them before attempting
+#' to scale.' indicates that monomorphic markers are present.
+#'
 #' @name gt_pca
 NULL
diff --git a/R/gt_pca_autoSVD.R b/R/gt_pca_autoSVD.R
@@ -50,6 +50,10 @@
 #'
 #' Note: rather than accessing these elements directly, it is better to use
 #' `tidy` and `augment`. See [`gt_pca_tidiers`].
+#' Note: If you encounter 'Error in rollmean(): Parameter 'size' is too large.'
+#' roll_size is exceeding the number of variants on at least one of your chromosomes.
+#' If you have pre-specified roll_size, you will need to reduce this parameter.
+#' If not, try specifying a reduced 'roll_size' to avoid this error.
 #'
 #' @export
 
@@ -72,7 +76,13 @@ gt_pca_autoSVD <- function(x, k = 10,
     gt_set_imputed(x, set = TRUE)
     on.exit(gt_set_imputed(x, set = FALSE))
   }
-  #browser()
+
+  if(is.null(roll_size)){
+    message("If you encounter 'Error in rollmean(): Parameter 'size' is too large.'
+          roll_size exceeds the number of variants on at least one of your chromosomes.
+          Try reducing 'roll_size' to avoid this error.")
+  }
+
   X <- attr(x$genotypes,"bigsnp") # convenient pointer
   x_ind_col <- show_loci(x)$big_index
   x_ind_row <- vctrs::vec_data(x$genotypes)
@@ -110,7 +120,6 @@ gt_pca_autoSVD <- function(x, k = 10,
                       verbose = verbose)
   # add names to the scores (to match them to data later)
   rownames(this_svd$u)<-x$id
-  #browser()
   this_svd$method <- "autoSVD"
   this_svd$call <- match.call()
   # subset the loci table to have only the snps of interest

diff --git a/R/gt_save.R b/R/gt_save.R
@@ -44,7 +44,7 @@ gt_save <- function(x, file_name = NULL, quiet = FALSE) {
   return(c(file_name,gt_get_file_names(x)))
 }
 
-#' a light version of gt_save that does not resave the RDS, to be used internally
+#' a light version of gt_save that does not resave the bigSNP RDS or backing file, to be used internally
 #' when creating a gen_tibble (if we have just created it, there is not need
 #' to resave it)
 #' @param x a [`gen_tibble`]

diff --git a/R/loci_ld_clump.R b/R/loci_ld_clump.R
@@ -80,10 +80,16 @@ loci_ld_clump.vctrs_bigSNP <- function(.x,
   # rows (individuals) that we want to use
   rows_to_keep <- vctrs::vec_data(.x)
   if (use_positions){
-    .positions <- show_loci(.x)$position
+    #.positions <- show_loci(.x)$position
+    #.positions <- attr(.x,"bigsnp")$map$physical.pos
+    .positions <- rep(NA, nrow(attr(.x,"bigsnp")$map))
+    .positions[show_loci(.x)$big_index] <- show_loci(.x)$position
   } else {
     .positions <- NULL
   }
+  # create a chromosome vector (fill gaps between bigsnpr and show_loci)
+  .chromosome <- rep(2147483647L, nrow(attr(.x,"bigsnp")$map))
+  .chromosome[show_loci(.x)$big_index] <- show_loci(.x)$chr_int
   # now figure out if we have any snp which have already been removed
   # those will go into `exclude`
   loci_not_in_tibble <- seq_len(nrow(attr(.x,"bigsnp")$map))[!seq_len(nrow(attr(.x,"bigsnp")$map)) %in%
@@ -95,7 +101,10 @@ loci_ld_clump.vctrs_bigSNP <- function(.x,
 
   # as long as we have more than one individual
   snp_clump_ids <- bigsnpr::snp_clumping(G = attr(.x,"bigsnp")$genotypes,
-                        infos.chr = show_loci(.x)$chr_int,
+                        #infos.chr = show_loci(.x)$chr_int,
+                        # TEMP HACK using the info from the bigsnpr object
+                        #infos.chr = cast_chromosome_to_int(attr(.x,"bigsnp")$map$chromosome),
+                        infos.chr = .chromosome,
                         ind.row = vctrs::vec_data(.x),
                         S = S,
                         thr.r2 = thr_r2,

diff --git a/R/pairwise_king.R b/R/pairwise_king.R
@@ -2,7 +2,8 @@
 #'
 #' This function computes the KING-robust estimator of kinship.
 #'
-#' Note that monomorphic sites are currently considered. What does PLINK do???
+#' Note that monomorphic sites are currently considered. Remove monomorphic sites
+#' before running pairwise_king if this is a concern.
 #' @param x a `gen_tibble` object.
 #' @param as_matrix boolean, determining whether the results should be a square symmetrical matrix (TRUE),
 #' or a tidied tibble (FALSE, the default)

diff --git a/R/rbind_dry_run.R b/R/rbind_dry_run.R
@@ -54,7 +54,6 @@ rbind_dry_run <- function(ref, target, use_position = FALSE, flip_strand = FALSE
   # rename the alleles
   ref_df <- ref_df %>% rename(allele_1 = "allele_alt", allele_2 = "allele_ref")
   target_df <- target_df %>% rename(allele_1 = "allele_alt", allele_2 = "allele_ref")
-  #browser()
   rbind_dry_run_df(ref_df = ref_df,
                    target_df = target_df,
                    flip_strand = flip_strand,
@@ -115,8 +114,6 @@ rbind_dry_run_df <- function(ref_df, target_df,  flip_strand, quiet){
     to_flip <- to_flip & !ambiguous_sub
     to_swap <- to_swap & !ambiguous_sub
   }
- # browser()
-
   # now we create the two reporting data.frames
   # note that they include all loci (including the ones we dropped because they
   # did not exist in one of the datasets)
@@ -153,7 +150,6 @@ rbind_dry_run_df <- function(ref_df, target_df,  flip_strand, quiet){
 
 
 resolve_missing_alleles <- function(missing_table, other_table){
-  #browser()
   missing_replacements <- rep(NA, nrow(missing_table))
   for (i_row in which(missing_table$allele_1=="0")){
     # get non-missing allele

diff --git a/R/rbind_gen_tibble.R b/R/rbind_gen_tibble.R
@@ -154,7 +154,6 @@ rbind.gen_tbl <- function(..., as_is = FALSE, flip_strand = FALSE, use_position
   vctrs::vec_data(ref$genotypes)
   #and finally append the loci table
   indivs_with_big_names <- c(names(ref$genotypes),names(target$genotypes))
-  #browser()
   #new_ref_loci_tbl$big_index<-match(new_ref_loci_tbl$name,merged_snp$map$marker.ID) # TODO check that this is the correct order!!!!
   new_ref_loci_tbl$big_index<-1:nrow(new_ref_loci_tbl) # by default, this should just be a subset in the same order as the reference
 

diff --git a/R/snp_king.R b/R/snp_king.R
@@ -2,9 +2,6 @@
 #'
 #' This function computes the KING-robust estimator of kinship.
 #'
-#' The last step is not optimised yet, as it does the division of the num by the
-#' den all in memory (on my TODO list...).
-#'
 #' @param X a [bigstatsr::FBM.code256] matrix (as found in the `genotypes`
 #' slot of a [bigsnpr::bigSNP] object).
 #' @param ind.row An optional vector of the row indices that are used.

diff --git a/R/vcf_to_fbm_cpp.R b/R/vcf_to_fbm_cpp.R
@@ -86,11 +86,6 @@ vcf_to_fbm_cpp <- function(
   # add the new columns
   file_backed_matrix$add_columns(chunk_info$num_loci)
   # fill them in
-  # file_backed_matrix[
-  #   ,
-  #   index_start:(index_start+chunk_info$num_loci-1)
-  # ] <- genotypes_matrix[,1:chunk_info$num_loci]
-
   write_to_FBM(file_backed_matrix,
                allele_counts = genotypes_matrix,
                col_start = index_start-1,

diff --git a/R/vcf_to_fbm_vcfR.R b/R/vcf_to_fbm_vcfR.R
@@ -153,7 +153,12 @@ poly_indiv_dosage <- function (x, max_ploidy){
 # get dosages for a genotype x and return them as raw for inclusion in the filebacked matrix
 poly_genotype_dosage <- function (x, max_ploidy){
   if (!is.na(x[1]) && x[1]!="."){
-    as.raw(sum(as.numeric(x)))
+    dosage <- sum(as.numeric(x))
+    if (dosage<(max_ploidy+1)){
+      return(as.raw(dosage))
+    } else{
+      stop("a genotype has more than max_ploidy alleles. We estimate max_ploidy from the first variant in the vcf file, make sure that variant is representative of ploidy (e.g. it is not on a sex chromosome).")
+    }
   } else {
     return(as.raw(max_ploidy+1))
   }

diff --git a/data-raw/ideas/allele_sharing.R b/data-raw/ideas/allele_sharing.R