From c501b81826b59064a54a08f659cef978c00ddbbd Mon Sep 17 00:00:00 2001 From: eviecarter33 Date: Mon, 9 Dec 2024 12:20:32 +0000 Subject: [PATCH 01/16] Remove code repetition in change_duplicated_filename, --- R/gen_tibble.R | 26 +------------------------- tests/testthat/test_gen_tibble.R | 30 +++++++++++++++--------------- 2 files changed, 16 insertions(+), 40 deletions(-) diff --git a/R/gen_tibble.R b/R/gen_tibble.R index 08695e22..af2f0807 100644 --- a/R/gen_tibble.R +++ b/R/gen_tibble.R @@ -519,7 +519,7 @@ change_duplicated_file_name <- function(file){ bk <- paste0(file, ".bk") rds <- paste0(file, ".rds") - if(file.exists(bk) && !file.exists(rds)){ + if(file.exists(bk) | file.exists(rds)){ version <- 2 @@ -531,29 +531,6 @@ change_duplicated_file_name <- function(file){ existing_files <- list.files(dirname(bk), pattern = paste0("^", base_name, "_v\\d+\\.bk$")) - if (length(existing_files) > 0) { - versions <- sub(version_pattern, "\\1", existing_files) - versions <- as.numeric(versions) - if (!any(is.na(versions))) { - version <- max(versions) + 1 - } - } - - new_file <- paste0(file,"_v",version) - - return(new_file) - } else if (file.exists(bk) && file.exists(rds)){ - - version <- 2 - - base_name <- basename(file) - - version_pattern <- paste0(base_name, "_v(\\d+)\\.bk$") - - # read existing files to check for existing versions - existing_files <- list.files(dirname(bk), pattern = paste0("^", base_name, "_v\\d+\\.bk$")) - - if (length(existing_files) > 0) { versions <- sub(version_pattern, "\\1", existing_files) versions <- as.numeric(versions) @@ -566,7 +543,6 @@ change_duplicated_file_name <- function(file){ return(new_file) } - return(file) } diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R index 6ff73210..26ef7040 100644 --- a/tests/testthat/test_gen_tibble.R +++ b/tests/testthat/test_gen_tibble.R @@ -484,24 +484,24 @@ test_that("check summary stats are the same for gen_tibbles read in different wa }) -test_indiv_meta <- data.frame (id=c("a","b","c"), - population = c("pop1","pop1","pop2")) -test_genotypes <- rbind(c(1,1,0,1,1,0), - c(2,1,0,0,0,0), - c(2,2,0,0,1,1)) -test_loci <- data.frame(name=paste0("rs",1:6), - chromosome=paste0("chr",c(1,1,1,1,2,2)), - position=as.integer(c(3,5,65,343,23,456)), - genetic_dist = as.double(rep(0,6)), - allele_ref = c("A","T","C","G","C","T"), - allele_alt = c("T","C", NA,"C","G","A")) +test_that("versioning if .bk already exists",{ + test_indiv_meta <- data.frame (id=c("a","b","c"), + population = c("pop1","pop1","pop2")) + test_genotypes <- rbind(c(1,1,0,1,1,0), + c(2,1,0,0,0,0), + c(2,2,0,0,1,1)) + test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=paste0("chr",c(1,1,1,1,2,2)), + position=as.integer(c(3,5,65,343,23,456)), + genetic_dist = as.double(rep(0,6)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) -test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, - indiv_meta = test_indiv_meta, quiet = TRUE, - backingfile = tempfile()) -test_that("versioning if .bk already exists",{ + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, + indiv_meta = test_indiv_meta, quiet = TRUE, + backingfile = tempfile()) # get the gt filenames files <- gt_get_file_names(test_gt) From dbc0c6a2f9b22865ccba6bfb64f40cc7a188b97d Mon Sep 17 00:00:00 2001 From: eviecarter33 Date: Mon, 9 Dec 2024 12:20:48 +0000 Subject: [PATCH 02/16] documentation --- man/gt_update_backing_file.Rd | 3 ++- man/pairwise_ibs.Rd | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/man/gt_update_backing_file.Rd b/man/gt_update_backing_file.Rd index 0441f8ed..dc505c2f 100644 --- a/man/gt_update_backing_file.Rd +++ b/man/gt_update_backing_file.Rd @@ -29,5 +29,6 @@ a \code{\link{gen_tibble}} with a backing file (i.e. a new File Backed Matrix) \description{ This functions forces a re-write of the file backing matrix to match the \code{\link{gen_tibble}}. Individuals and loci are subsetted and reordered according to -the current state of the \code{gen_tibble}. +the current state of the \code{gen_tibble}. Tests for this function are in +test_gt_order_loci.R } diff --git a/man/pairwise_ibs.Rd b/man/pairwise_ibs.Rd index 4a639f30..a8c9badc 100644 --- a/man/pairwise_ibs.Rd +++ b/man/pairwise_ibs.Rd @@ -32,6 +32,6 @@ and one of number of valid alleles (i.e. 2\emph{n_loci - 2}missing_loci) This function computes the IBS matrix. } \details{ -Note that monomorphic sites are currently counted. Should we filter -them beforehand? What does plink do? +Note that monomorphic sites are currently considered. Remove monomorphic sites +before running pairwise_king if this is a concern. } From f8def5ae4434a6a022d681bfcf01d70879870c4f Mon Sep 17 00:00:00 2001 From: eviecarter33 Date: Mon, 9 Dec 2024 12:22:05 +0000 Subject: [PATCH 03/16] update website functions --- _pkgdown.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 1acb8bc6..ef4893ea 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -13,6 +13,7 @@ reference: - gt_save - gt_load - gt_get_file_names + - gt_update_backingfile - title: "Show" desc: "Functions to view loci and genotype information stored in a `gen_tibble`." contents: @@ -56,6 +57,8 @@ reference: desc: "Functions operating across loci." contents: - starts_with("loci_") + - gt_order_loci + - is_loci_table_ordered - title: "Individual" desc: "Functions operating across individuals." contents: From 97c5e1ef3b8d1909fda107aa3b65ec313952e0cf Mon Sep 17 00:00:00 2001 From: eviecarter33 Date: Tue, 10 Dec 2024 06:55:16 +0000 Subject: [PATCH 04/16] Update versioning and add test --- R/gen_tibble.R | 28 +++++++---- tests/testthat/test_gen_tibble.R | 79 +++++++++++++++++++++----------- 2 files changed, 70 insertions(+), 37 deletions(-) diff --git a/R/gen_tibble.R b/R/gen_tibble.R index af2f0807..5068655b 100644 --- a/R/gen_tibble.R +++ b/R/gen_tibble.R @@ -520,26 +520,34 @@ change_duplicated_file_name <- function(file){ rds <- paste0(file, ".rds") if(file.exists(bk) | file.exists(rds)){ - version <- 2 - - base_name <- basename(file) + # extract the base name and version number + base_name_pattern <- "^(.*)_v(\\d+)$" + # check for any matches + matches <- regmatches(basename(file), regexec(base_name_pattern, basename(file))) + + if (length(matches[[1]]) > 0) { + # Extract base name without version and current version number + base_name <- matches[[1]][2] # Part before "_v" + current_version <- as.numeric(matches[[1]][3]) # extract current version number + } else { + base_name <- basename(file) # Use the full name if there's no "_v" suffix + current_version <- 1 + } version_pattern <- paste0(base_name, "_v(\\d+)\\.bk$") - - # read existing files to check for existing versions - existing_files <- list.files(dirname(bk), pattern = paste0("^", base_name, "_v\\d+\\.bk$")) - + # read files to check for existing versions + existing_files <- list.files(dirname(bk), pattern = paste0("^", base_name, "_v\\d+\\.bk$")) if (length(existing_files) > 0) { versions <- sub(version_pattern, "\\1", existing_files) versions <- as.numeric(versions) if (!any(is.na(versions))) { - version <- max(versions) + 1 + version <- max(versions) + 1 # add 1 to the version number } } - - new_file <- paste0(file,"_v",version) + # create new file path + new_file <- paste0(dirname(file), "/", base_name, "_v", version) return(new_file) } diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R index 26ef7040..8b564fb6 100644 --- a/tests/testthat/test_gen_tibble.R +++ b/tests/testthat/test_gen_tibble.R @@ -69,9 +69,6 @@ test_that("gen_tibble catches invalid alleles",{ loci = test_loci_wrong, valid_alleles = c("A","C","T","G","0"), quiet = TRUE), "can not be a valid allele") - - - }) @@ -91,8 +88,6 @@ test_that("if order of loci is changed, order of genotypes also changes",{ #check that genotypes are now reordered according to random order expect_equal(pop_b_gen_reordered, show_genotypes(pop_b)) - - }) test_that("gen_tibble does not accept character matrix",{ @@ -485,7 +480,6 @@ test_that("check summary stats are the same for gen_tibbles read in different wa test_that("versioning if .bk already exists",{ - test_indiv_meta <- data.frame (id=c("a","b","c"), population = c("pop1","pop1","pop2")) test_genotypes <- rbind(c(1,1,0,1,1,0), @@ -497,20 +491,16 @@ test_that("versioning if .bk already exists",{ genetic_dist = as.double(rep(0,6)), allele_ref = c("A","T","C","G","C","T"), allele_alt = c("T","C", NA,"C","G","A")) - - test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE, backingfile = tempfile()) # get the gt filenames files <- gt_get_file_names(test_gt) - # remove the .rds expect_true(file.remove(files[1])) - - file <- gsub(".bk","",files[2],) - + # remove extension + file <- gsub(".bk","",files[2]) # create gt using the same backingfile name test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE, @@ -537,6 +527,44 @@ test_that("versioning if .bk already exists",{ }) +test_that("versioning updates correctly through gt_order_loci",{ + test_indiv_meta <- data.frame (id=c("a","b","c"), + population = c("pop1","pop1","pop2")) + test_genotypes <- rbind(c(1,1,0,1,1,0), + c(2,1,0,0,0,0), + c(2,2,0,0,1,1)) + test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=paste0("chr",c(1,1,1,1,2,2)), + position=as.integer(c(3,5,65,343,23,456)), + genetic_dist = as.double(rep(0,6)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, + indiv_meta = test_indiv_meta, quiet = TRUE, + backingfile = tempfile()) + + # get the gt filenames + files <- gt_get_file_names(test_gt) + # remove extension + file <- gsub(".bk","",files[2]) + # create gt using the same backingfile name + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, + indiv_meta = test_indiv_meta, quiet = TRUE, + backingfile = file) + + # get new file names + new_files <- gt_get_file_names(test_gt) + + # mess with the loci table + test_gt <- test_gt %>% select_loci(c(2,4,5,1,6)) + test_gt <- gt_order_loci(test_gt, use_current_table = FALSE, quiet = TRUE) + gt_save(test_gt, quiet = TRUE) + + # check file names + expect_equal(gt_get_file_names(test_gt)[1], paste0(file,"_v3.rds")) + expect_equal(gt_get_file_names(test_gt)[2], paste0(file,"_v3.bk")) +}) + test_that("chr_int is always an integer",{ @@ -579,19 +607,19 @@ test_that("chr_int is always an integer",{ }) -test_indiv_meta <- data.frame (id=c("a","b","c")) -test_genotypes <- rbind(c(1,1,0,1,1,0), - c(2,1,0,0,0,0), - c(2,2,0,0,1,1)) -test_loci <- data.frame(name=paste0("rs",1:6), - chromosome=paste0("chr",c(1,1,1,1,2,2)), - position=as.integer(c(3,5,65,343,23,456)), - genetic_dist = as.double(rep(0,6)), - allele_ref = c("A","T","C","G","C","T"), - allele_alt = c("T","C", NA,"C","G","A")) -test_that("gt without population is valid",{ +test_that("gt without population is valid",{ + test_indiv_meta <- data.frame (id=c("a","b","c")) + test_genotypes <- rbind(c(1,1,0,1,1,0), + c(2,1,0,0,0,0), + c(2,2,0,0,1,1)) + test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=paste0("chr",c(1,1,1,1,2,2)), + position=as.integer(c(3,5,65,343,23,456)), + genetic_dist = as.double(rep(0,6)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE, backingfile = tempfile()) @@ -628,8 +656,7 @@ test_that("additional vcf tests with larger file",{ anole_gt2 <- gen_tibble(vcf_path, quiet = TRUE, parser = "cpp", backingfile = tempfile("anolis_"), chunk_size = 1000, n_cores = 2) expect_true(all.equal(show_genotypes(anole_gt2),show_genotypes(anole_gt_vcfr))) -} -) +}) test_that("vcf's with haploid markers",{ @@ -691,8 +718,6 @@ test_that("chr_int is correct",{ test_that("gen_tibble family.ID from vcf",{ - - # If the gen_tibble has been read in from vcf format, family.ID in the resulting # plink files will be the same as sample.ID. From e8b8a3c9b9f720f0c75cece846b957db8a79e598 Mon Sep 17 00:00:00 2001 From: eviecarter33 Date: Tue, 10 Dec 2024 07:13:05 +0000 Subject: [PATCH 05/16] Move test to test_gt_as_plink --- tests/testthat/test_gen_tibble.R | 40 ------------------------------- tests/testthat/test_gt_as_plink.R | 32 +++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 40 deletions(-) diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R index 8b564fb6..c91f1b08 100644 --- a/tests/testthat/test_gen_tibble.R +++ b/tests/testthat/test_gen_tibble.R @@ -716,46 +716,6 @@ test_that("chr_int is correct",{ }) - -test_that("gen_tibble family.ID from vcf",{ - # If the gen_tibble has been read in from vcf format, family.ID in the resulting - # plink files will be the same as sample.ID. - - - #### With vcfr parser - vcf_path <- system.file("extdata/pop_b.vcf", package = "tidypopgen") - pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), - parser="vcfR") - - # write vcf_path using gt_as_plink - pop_b_bed <- gt_as_plink(pop_b_vcf_gt, tempfile()) - - # substitute ".bed" for ".fam" in pop_b_bed - fam_path <- gsub(".bed",".fam",pop_b_bed) - - # read in the .fam file - fam <- read.table(fam_path, header = FALSE, stringsAsFactors = FALSE) - - expect_true(all(fam$V1 == pop_b_vcf_gt$id)) - expect_true(all(fam$V1 == fam$V2)) - - #### With cpp parser - vcf_path <- system.file("extdata/pop_b.vcf", package = "tidypopgen") - pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), - parser="cpp") - - # write vcf_path using gt_as_plink - pop_b_bed <- gt_as_plink(pop_b_vcf_gt, tempfile()) - - # substitute ".bed" for ".fam" in pop_b_bed - fam_path <- gsub(".bed",".fam",pop_b_bed) - - # read in the .fam file - fam <- read.table(fam_path, header = FALSE, stringsAsFactors = FALSE) - - expect_true(all(fam$V1 == pop_b_vcf_gt$id)) - expect_true(all(fam$V1 == fam$V2)) -}) # Windows prevents the deletion of the backing file. It's something to do with the memory mapping # library used by bigsnpr # test_that("on error, we remove the old files",{ diff --git a/tests/testthat/test_gt_as_plink.R b/tests/testthat/test_gt_as_plink.R index 6605e09c..2b6f3d0f 100644 --- a/tests/testthat/test_gt_as_plink.R +++ b/tests/testthat/test_gt_as_plink.R @@ -89,3 +89,35 @@ test_that("test for overwriting files",{ }) +test_that("family.ID equals sample.ID from vcf",{ + # If the gen_tibble has been read in from vcf format, family.ID in the resulting + # plink files will be the same as sample.ID. + + #### With vcfr parser + vcf_path <- system.file("extdata/pop_b.vcf", package = "tidypopgen") + pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), + parser="vcfR") + # write vcf_path using gt_as_plink + pop_b_bed <- gt_as_plink(pop_b_vcf_gt, tempfile()) + # substitute ".bed" for ".fam" + fam_path <- gsub(".bed",".fam",pop_b_bed) + # read in the .fam file + fam <- read.table(fam_path, header = FALSE, stringsAsFactors = FALSE) + # check that family.ID is the same as sample.ID + expect_true(all(fam$V1 == pop_b_vcf_gt$id)) + expect_true(all(fam$V1 == fam$V2)) + + #### With cpp parser + vcf_path <- system.file("extdata/pop_b.vcf", package = "tidypopgen") + pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), + parser="cpp") + # write vcf_path using gt_as_plink + pop_b_bed <- gt_as_plink(pop_b_vcf_gt, tempfile()) + # substitute ".bed" for ".fam" + fam_path <- gsub(".bed",".fam",pop_b_bed) + # read in the .fam file + fam <- read.table(fam_path, header = FALSE, stringsAsFactors = FALSE) + # check that family.ID is the same as sample.ID + expect_true(all(fam$V1 == pop_b_vcf_gt$id)) + expect_true(all(fam$V1 == fam$V2)) +}) From 3fccdfe31177b6095228673241d430f1f6bb98fa Mon Sep 17 00:00:00 2001 From: eviecarter33 Date: Fri, 13 Dec 2024 09:11:08 +0000 Subject: [PATCH 06/16] revert .yml changes to avoid conflict --- _pkgdown.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index ef4893ea..1acb8bc6 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -13,7 +13,6 @@ reference: - gt_save - gt_load - gt_get_file_names - - gt_update_backingfile - title: "Show" desc: "Functions to view loci and genotype information stored in a `gen_tibble`." contents: @@ -57,8 +56,6 @@ reference: desc: "Functions operating across loci." contents: - starts_with("loci_") - - gt_order_loci - - is_loci_table_ordered - title: "Individual" desc: "Functions operating across individuals." contents: From 3f2e5b37fc72fdc2a228f1251824426d9c0c0a64 Mon Sep 17 00:00:00 2001 From: eviecarter33 Date: Fri, 13 Dec 2024 09:57:15 +0000 Subject: [PATCH 07/16] add gt_as_plink test back in --- tests/testthat/test_gt_as_plink.R | 32 +++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/testthat/test_gt_as_plink.R b/tests/testthat/test_gt_as_plink.R index 35ebfe35..e4e2e091 100644 --- a/tests/testthat/test_gt_as_plink.R +++ b/tests/testthat/test_gt_as_plink.R @@ -137,3 +137,35 @@ test_that("gt_as_plink can use chr_int",{ expect_equal(result, show_loci(test_gt_chr_int)$chromosome) }) +test_that("family.ID equals sample.ID from vcf",{ + # If the gen_tibble has been read in from vcf format, family.ID in the resulting + # plink files will be the same as sample.ID. + + #### With vcfr parser + vcf_path <- system.file("extdata/pop_b.vcf", package = "tidypopgen") + pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), + parser="vcfR") + # write vcf_path using gt_as_plink + pop_b_bed <- gt_as_plink(pop_b_vcf_gt, tempfile()) + # substitute ".bed" for ".fam" + fam_path <- gsub(".bed",".fam",pop_b_bed) + # read in the .fam file + fam <- read.table(fam_path, header = FALSE, stringsAsFactors = FALSE) + # check that family.ID is the same as sample.ID + expect_true(all(fam$V1 == pop_b_vcf_gt$id)) + expect_true(all(fam$V1 == fam$V2)) + + #### With cpp parser + vcf_path <- system.file("extdata/pop_b.vcf", package = "tidypopgen") + pop_b_vcf_gt <- gen_tibble(vcf_path, quiet=TRUE,backingfile = tempfile(), + parser="cpp") + # write vcf_path using gt_as_plink + pop_b_bed <- gt_as_plink(pop_b_vcf_gt, tempfile()) + # substitute ".bed" for ".fam" + fam_path <- gsub(".bed",".fam",pop_b_bed) + # read in the .fam file + fam <- read.table(fam_path, header = FALSE, stringsAsFactors = FALSE) + # check that family.ID is the same as sample.ID + expect_true(all(fam$V1 == pop_b_vcf_gt$id)) + expect_true(all(fam$V1 == fam$V2)) +}) From c4d68920c24191e87bd1cb6bd2a2512369ee825e Mon Sep 17 00:00:00 2001 From: eviecarter33 Date: Fri, 13 Dec 2024 11:00:10 +0000 Subject: [PATCH 08/16] Moving version tests to try and trigger error --- tests/testthat/test_gen_tibble.R | 89 --------------------- tests/testthat/test_gen_tibble_versioning.R | 88 ++++++++++++++++++++ 2 files changed, 88 insertions(+), 89 deletions(-) create mode 100644 tests/testthat/test_gen_tibble_versioning.R diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R index ee9f080e..ea33beb6 100644 --- a/tests/testthat/test_gen_tibble.R +++ b/tests/testthat/test_gen_tibble.R @@ -478,95 +478,6 @@ test_that("check summary stats are the same for gen_tibbles read in different wa }) - -test_that("versioning if .bk already exists",{ - test_indiv_meta <- data.frame (id=c("a","b","c"), - population = c("pop1","pop1","pop2")) - test_genotypes <- rbind(c(1,1,0,1,1,0), - c(2,1,0,0,0,0), - c(2,2,0,0,1,1)) - test_loci <- data.frame(name=paste0("rs",1:6), - chromosome=paste0("chr",c(1,1,1,1,2,2)), - position=as.integer(c(3,5,65,343,23,456)), - genetic_dist = as.double(rep(0,6)), - allele_ref = c("A","T","C","G","C","T"), - allele_alt = c("T","C", NA,"C","G","A")) - test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, - indiv_meta = test_indiv_meta, quiet = TRUE, - backingfile = tempfile()) - - # get the gt filenames - files <- gt_get_file_names(test_gt) - # remove the .rds - expect_true(file.remove(files[1])) - - file <- gsub(".bk","",files[2]) - - # create gt using the same backingfile name - test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, - indiv_meta = test_indiv_meta, quiet = TRUE, - backingfile = file) - - # get new file names - new_files <- gt_get_file_names(test_gt) - - # new_files has the same name as original file, plus a version extension - expect_equal(new_files[2], paste0(file,"_v2.bk")) - - # repeating the process creates another version - expect_true(file.remove(new_files[1])) - expect_false(file.exists(new_files[1])) - expect_true(file.exists(new_files[2])) - - test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, - indiv_meta = test_indiv_meta, quiet = TRUE, - backingfile = file) - - new_version_files <- gt_get_file_names(test_gt) - - expect_equal(new_version_files[2], paste0(file,"_v3.bk")) - -}) - -test_that("versioning updates correctly through gt_order_loci",{ - test_indiv_meta <- data.frame (id=c("a","b","c"), - population = c("pop1","pop1","pop2")) - test_genotypes <- rbind(c(1,1,0,1,1,0), - c(2,1,0,0,0,0), - c(2,2,0,0,1,1)) - test_loci <- data.frame(name=paste0("rs",1:6), - chromosome=paste0("chr",c(1,1,1,1,2,2)), - position=as.integer(c(3,5,65,343,23,456)), - genetic_dist = as.double(rep(0,6)), - allele_ref = c("A","T","C","G","C","T"), - allele_alt = c("T","C", NA,"C","G","A")) - test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, - indiv_meta = test_indiv_meta, quiet = TRUE, - backingfile = tempfile()) - - # get the gt filenames - files <- gt_get_file_names(test_gt) - # remove extension - file <- gsub(".bk","",files[2]) - # create gt using the same backingfile name - test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, - indiv_meta = test_indiv_meta, quiet = TRUE, - backingfile = file) - - # get new file names - new_files <- gt_get_file_names(test_gt) - - # mess with the loci table - test_gt <- test_gt %>% select_loci(c(2,4,5,1,6)) - test_gt <- gt_order_loci(test_gt, use_current_table = FALSE, quiet = TRUE) - gt_save(test_gt, quiet = TRUE) - - # check file names - expect_equal(gt_get_file_names(test_gt)[1], paste0(file,"_v3.rds")) - expect_equal(gt_get_file_names(test_gt)[2], paste0(file,"_v3.bk")) -}) - - test_that("chr_int is always an integer",{ # matrix method diff --git a/tests/testthat/test_gen_tibble_versioning.R b/tests/testthat/test_gen_tibble_versioning.R new file mode 100644 index 00000000..08e72eac --- /dev/null +++ b/tests/testthat/test_gen_tibble_versioning.R @@ -0,0 +1,88 @@ + +test_that("versioning if .bk already exists",{ + test_indiv_meta <- data.frame (id=c("a","b","c"), + population = c("pop1","pop1","pop2")) + test_genotypes <- rbind(c(1,1,0,1,1,0), + c(2,1,0,0,0,0), + c(2,2,0,0,1,1)) + test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=paste0("chr",c(1,1,1,1,2,2)), + position=as.integer(c(3,5,65,343,23,456)), + genetic_dist = as.double(rep(0,6)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, + indiv_meta = test_indiv_meta, quiet = TRUE, + backingfile = tempfile()) + + # get the gt filenames + files <- gt_get_file_names(test_gt) + # remove the .rds + expect_true(file.remove(files[1])) + + file <- gsub(".bk","",files[2]) + + # create gt using the same backingfile name + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, + indiv_meta = test_indiv_meta, quiet = TRUE, + backingfile = file) + + # get new file names + new_files <- gt_get_file_names(test_gt) + + # new_files has the same name as original file, plus a version extension + expect_equal(new_files[2], paste0(file,"_v2.bk")) + + # repeating the process creates another version + expect_true(file.remove(new_files[1])) + expect_false(file.exists(new_files[1])) + expect_true(file.exists(new_files[2])) + + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, + indiv_meta = test_indiv_meta, quiet = TRUE, + backingfile = file) + + new_version_files <- gt_get_file_names(test_gt) + + expect_equal(new_version_files[2], paste0(file,"_v3.bk")) + +}) + +test_that("versioning updates correctly through gt_order_loci",{ + test_indiv_meta <- data.frame (id=c("a","b","c"), + population = c("pop1","pop1","pop2")) + test_genotypes <- rbind(c(1,1,0,1,1,0), + c(2,1,0,0,0,0), + c(2,2,0,0,1,1)) + test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=paste0("chr",c(1,1,1,1,2,2)), + position=as.integer(c(3,5,65,343,23,456)), + genetic_dist = as.double(rep(0,6)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, + indiv_meta = test_indiv_meta, quiet = TRUE, + backingfile = tempfile()) + + # get the gt filenames + files <- gt_get_file_names(test_gt) + # remove extension + file <- gsub(".bk","",files[2]) + # create gt using the same backingfile name + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, + indiv_meta = test_indiv_meta, quiet = TRUE, + backingfile = file) + + # get new file names + new_files <- gt_get_file_names(test_gt) + + # mess with the loci table + test_gt <- test_gt %>% select_loci(c(2,4,5,1,6)) + test_gt <- gt_order_loci(test_gt, use_current_table = FALSE, quiet = TRUE) + gt_save(test_gt, quiet = TRUE) + + # check file names + expect_equal(gt_get_file_names(test_gt)[1], paste0(file,"_v3.rds")) + expect_equal(gt_get_file_names(test_gt)[2], paste0(file,"_v3.bk")) +}) + From f642a8beccdb1148e2732e78239fa2f7a5bc0758 Mon Sep 17 00:00:00 2001 From: eviecarter33 Date: Fri, 13 Dec 2024 13:12:16 +0000 Subject: [PATCH 09/16] check directory exists, debugging assert_dir error --- tests/testthat/test_gen_tibble_versioning.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/testthat/test_gen_tibble_versioning.R b/tests/testthat/test_gen_tibble_versioning.R index 08e72eac..c717eeee 100644 --- a/tests/testthat/test_gen_tibble_versioning.R +++ b/tests/testthat/test_gen_tibble_versioning.R @@ -22,6 +22,10 @@ test_that("versioning if .bk already exists",{ file <- gsub(".bk","",files[2]) + # check the directory + tempfile_dir <- dirname(file) + expect_true(dir.exists(tempfile_dir)) + # create gt using the same backingfile name test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE, From 8f1f94bae31aded82ff6a36ff1059092581f0b0c Mon Sep 17 00:00:00 2001 From: eviecarter33 Date: Fri, 13 Dec 2024 13:28:32 +0000 Subject: [PATCH 10/16] test error by switching order of tests --- tests/testthat/test_gen_tibble_versioning.R | 69 ++++++++++----------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/tests/testthat/test_gen_tibble_versioning.R b/tests/testthat/test_gen_tibble_versioning.R index c717eeee..85c9c6f4 100644 --- a/tests/testthat/test_gen_tibble_versioning.R +++ b/tests/testthat/test_gen_tibble_versioning.R @@ -1,5 +1,4 @@ - -test_that("versioning if .bk already exists",{ +test_that("versioning updates correctly through gt_order_loci",{ test_indiv_meta <- data.frame (id=c("a","b","c"), population = c("pop1","pop1","pop2")) test_genotypes <- rbind(c(1,1,0,1,1,0), @@ -17,15 +16,8 @@ test_that("versioning if .bk already exists",{ # get the gt filenames files <- gt_get_file_names(test_gt) - # remove the .rds - expect_true(file.remove(files[1])) - + # remove extension file <- gsub(".bk","",files[2]) - - # check the directory - tempfile_dir <- dirname(file) - expect_true(dir.exists(tempfile_dir)) - # create gt using the same backingfile name test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE, @@ -34,25 +26,18 @@ test_that("versioning if .bk already exists",{ # get new file names new_files <- gt_get_file_names(test_gt) - # new_files has the same name as original file, plus a version extension - expect_equal(new_files[2], paste0(file,"_v2.bk")) - - # repeating the process creates another version - expect_true(file.remove(new_files[1])) - expect_false(file.exists(new_files[1])) - expect_true(file.exists(new_files[2])) - - test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, - indiv_meta = test_indiv_meta, quiet = TRUE, - backingfile = file) - - new_version_files <- gt_get_file_names(test_gt) - - expect_equal(new_version_files[2], paste0(file,"_v3.bk")) + # mess with the loci table + test_gt <- test_gt %>% select_loci(c(2,4,5,1,6)) + test_gt <- gt_order_loci(test_gt, use_current_table = FALSE, quiet = TRUE) + gt_save(test_gt, quiet = TRUE) + # check file names + expect_equal(gt_get_file_names(test_gt)[1], paste0(file,"_v3.rds")) + expect_equal(gt_get_file_names(test_gt)[2], paste0(file,"_v3.bk")) }) -test_that("versioning updates correctly through gt_order_loci",{ + +test_that("versioning if .bk already exists",{ test_indiv_meta <- data.frame (id=c("a","b","c"), population = c("pop1","pop1","pop2")) test_genotypes <- rbind(c(1,1,0,1,1,0), @@ -70,8 +55,15 @@ test_that("versioning updates correctly through gt_order_loci",{ # get the gt filenames files <- gt_get_file_names(test_gt) - # remove extension + # remove the .rds + expect_true(file.remove(files[1])) + file <- gsub(".bk","",files[2]) + + # check the directory + tempfile_dir <- dirname(file) + expect_true(dir.exists(tempfile_dir)) + # create gt using the same backingfile name test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE, @@ -80,13 +72,20 @@ test_that("versioning updates correctly through gt_order_loci",{ # get new file names new_files <- gt_get_file_names(test_gt) - # mess with the loci table - test_gt <- test_gt %>% select_loci(c(2,4,5,1,6)) - test_gt <- gt_order_loci(test_gt, use_current_table = FALSE, quiet = TRUE) - gt_save(test_gt, quiet = TRUE) + # new_files has the same name as original file, plus a version extension + expect_equal(new_files[2], paste0(file,"_v2.bk")) - # check file names - expect_equal(gt_get_file_names(test_gt)[1], paste0(file,"_v3.rds")) - expect_equal(gt_get_file_names(test_gt)[2], paste0(file,"_v3.bk")) -}) + # repeating the process creates another version + expect_true(file.remove(new_files[1])) + expect_false(file.exists(new_files[1])) + expect_true(file.exists(new_files[2])) + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, + indiv_meta = test_indiv_meta, quiet = TRUE, + backingfile = file) + + new_version_files <- gt_get_file_names(test_gt) + + expect_equal(new_version_files[2], paste0(file,"_v3.bk")) + +}) From 0b16a822a83bd6496eb9cda688fa88b121f81e6f Mon Sep 17 00:00:00 2001 From: eviecarter33 Date: Fri, 13 Dec 2024 13:40:57 +0000 Subject: [PATCH 11/16] debug --- tests/testthat/test_gen_tibble_versioning.R | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test_gen_tibble_versioning.R b/tests/testthat/test_gen_tibble_versioning.R index 85c9c6f4..0854d101 100644 --- a/tests/testthat/test_gen_tibble_versioning.R +++ b/tests/testthat/test_gen_tibble_versioning.R @@ -16,8 +16,19 @@ test_that("versioning updates correctly through gt_order_loci",{ # get the gt filenames files <- gt_get_file_names(test_gt) + + # do the files exist? + expect_true(file.exists(files[1])) + expect_true(file.exists(files[2])) + # remove extension file <- gsub(".bk","",files[2]) + + # does the directory still exist? + print(dirname(file)) + tempfile_dir <- dirname(file) + expect_true(dir.exists(tempfile_dir)) + # create gt using the same backingfile name test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE, @@ -55,12 +66,19 @@ test_that("versioning if .bk already exists",{ # get the gt filenames files <- gt_get_file_names(test_gt) + + # do the files exist? + expect_true(file.exists(files[1])) + expect_true(file.exists(files[2])) + # remove the .rds expect_true(file.remove(files[1])) + # remove extension file <- gsub(".bk","",files[2]) - # check the directory + # does the directory still exist? + print(dirname(file)) tempfile_dir <- dirname(file) expect_true(dir.exists(tempfile_dir)) From 86a0632187f11da70d28522350c30c3439450a8c Mon Sep 17 00:00:00 2001 From: eviecarter33 Date: Fri, 13 Dec 2024 15:45:51 +0000 Subject: [PATCH 12/16] new tests of chr_int and ordering --- tests/testthat/test_gt_order_loci.R | 56 +++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/tests/testthat/test_gt_order_loci.R b/tests/testthat/test_gt_order_loci.R index fe12fa93..1289f9fc 100644 --- a/tests/testthat/test_gt_order_loci.R +++ b/tests/testthat/test_gt_order_loci.R @@ -166,3 +166,59 @@ test_that("gt_order_loci use_current_table = TRUE",{ expect_error(gt_order_loci(test_gt, use_current_table = TRUE, quiet = TRUE),"Your loci are not sorted within chromosomes") }) +test_that("chr_int from character chromosomes",{ + test_indiv_meta <- data.frame (id=c("a","b","c"), + population = c("pop1","pop1","pop2")) + test_genotypes <- rbind(c(1,1,0,1,1,0), + c(2,1,0,0,0,0), + c(2,2,0,0,1,1)) + test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=as.character(paste0("chr",c(2,13,8,1,2,3))), + position=as.integer(c(23,3,5,65,343,46)), + genetic_dist = as.double(rep(0,6)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) + + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE, backingfile = tempfile()) + + # check chr_int makes sense + expect_equal(show_loci(test_gt)$chr_int, c(2,13,8,1,2,3)) + + # use gt_order_loci to reorder + test_gt <- gt_order_loci(test_gt, use_current_table = FALSE, quiet = TRUE) + expect_equal(show_loci(test_gt)$chr_int, c(1,13,2,2,3,8)) + expect_equal(show_loci(test_gt)$chromosome, c("chr1","chr2","chr2","chr3","chr8","chr13")) +}) + + +test_that("chr_int from factor chromosomes",{ + test_indiv_meta <- data.frame (id=c("a","b","c"), + population = c("pop1","pop1","pop2")) + test_genotypes <- rbind(c(1,1,0,1,1,0), + c(2,1,0,0,0,0), + c(2,2,0,0,1,1)) + test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=as.factor(c(2,13,8,1,2,3)), + position=as.integer(c(23,3,5,65,343,46)), + genetic_dist = as.double(rep(0,6)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) + + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE, backingfile = tempfile()) + + # check chr_int makes sense + expect_equal(show_loci(test_gt)$chr_int, c(2,13,8,1,2,3)) + + # use gt_order_loci to reorder + test_gt <- gt_order_loci(test_gt, use_current_table = FALSE, quiet = TRUE) + expect_equal(show_loci(test_gt)$chr_int, c(1,2,2,3,8,13)) + expect_equal(show_loci(test_gt)$chromosome, c(1,2,2,3,8,13)) +}) + + + + + + + + From a6c892d7a4b5b8230787c014e427a3afffcf669e Mon Sep 17 00:00:00 2001 From: eviecarter33 Date: Fri, 13 Dec 2024 16:32:25 +0000 Subject: [PATCH 13/16] fix test --- tests/testthat/test_gt_order_loci.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test_gt_order_loci.R b/tests/testthat/test_gt_order_loci.R index 1289f9fc..dc0c9f82 100644 --- a/tests/testthat/test_gt_order_loci.R +++ b/tests/testthat/test_gt_order_loci.R @@ -173,7 +173,7 @@ test_that("chr_int from character chromosomes",{ c(2,1,0,0,0,0), c(2,2,0,0,1,1)) test_loci <- data.frame(name=paste0("rs",1:6), - chromosome=as.character(paste0("chr",c(2,13,8,1,2,3))), + chromosome=as.character(c(2,13,8,1,2,3)), position=as.integer(c(23,3,5,65,343,46)), genetic_dist = as.double(rep(0,6)), allele_ref = c("A","T","C","G","C","T"), @@ -186,8 +186,8 @@ test_that("chr_int from character chromosomes",{ # use gt_order_loci to reorder test_gt <- gt_order_loci(test_gt, use_current_table = FALSE, quiet = TRUE) - expect_equal(show_loci(test_gt)$chr_int, c(1,13,2,2,3,8)) - expect_equal(show_loci(test_gt)$chromosome, c("chr1","chr2","chr2","chr3","chr8","chr13")) + expect_equal(show_loci(test_gt)$chr_int, c(1,2,2,3,8,13)) + expect_equal(show_loci(test_gt)$chromosome, c("1","2","2","3","8","13")) }) From 29c03c41698cc077e936360ef4603cac5f9fbf59 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Fri, 13 Dec 2024 16:39:37 +0000 Subject: [PATCH 14/16] fix chr_as_int --- R/gen_tibble.R | 2 ++ tests/testthat/test_gt_order_loci.R | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/R/gen_tibble.R b/R/gen_tibble.R index 5068655b..ee5a2beb 100644 --- a/R/gen_tibble.R +++ b/R/gen_tibble.R @@ -563,6 +563,8 @@ cast_chromosome_to_int <- function(chromosome){ } # if chromosome is a character, then cast it to integer if (is.character(chromosome)){ + # attempt to strip chr from the chromosome + chromosome <- gsub("^(chromosome_|chr_|chromosome|chr)", "", chromosome, ignore.case = TRUE) chromosome <- tryCatch(as.integer(chromosome), warning = function(e) {as.integer(as.factor(chromosome))}) } if (is.numeric(chromosome)){ diff --git a/tests/testthat/test_gt_order_loci.R b/tests/testthat/test_gt_order_loci.R index dc0c9f82..37045067 100644 --- a/tests/testthat/test_gt_order_loci.R +++ b/tests/testthat/test_gt_order_loci.R @@ -212,7 +212,7 @@ test_that("chr_int from factor chromosomes",{ # use gt_order_loci to reorder test_gt <- gt_order_loci(test_gt, use_current_table = FALSE, quiet = TRUE) expect_equal(show_loci(test_gt)$chr_int, c(1,2,2,3,8,13)) - expect_equal(show_loci(test_gt)$chromosome, c(1,2,2,3,8,13)) + expect_equal(as.character(show_loci(test_gt)$chromosome),as.character(c(1,2,2,3,8,13))) }) From 2ed883a7a315620ff8d0b4b4f14d3174b9ce3813 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Fri, 13 Dec 2024 17:30:57 +0000 Subject: [PATCH 15/16] handle physical pos when reordering --- R/gen_tibble.R | 4 ---- R/gt_order_loci.R | 9 ++++++--- R/gt_update_backingfile.R | 17 ++++++++++++++++- R/is_loci_table_ordered.R | 24 ++++++++++++++++++++---- man/gt_order_loci.Rd | 12 +++++++++++- man/gt_update_backingfile.Rd | 12 +++++++++++- man/is_loci_table_ordered.Rd | 23 ++++++++++++++++++++--- tests/testthat/test_gen_tibble.R | 2 +- tests/testthat/test_gt_order_loci.R | 28 ++++++++++++++++++++++++++++ 9 files changed, 113 insertions(+), 18 deletions(-) diff --git a/R/gen_tibble.R b/R/gen_tibble.R index ee5a2beb..0d6abcb2 100644 --- a/R/gen_tibble.R +++ b/R/gen_tibble.R @@ -288,9 +288,6 @@ gen_tibble.matrix <- function(x, indiv_meta, loci, ..., backingfile <- change_duplicated_file_name(backingfile) } - # use code for NA in FBM.256 -# x[is.na(x)]<-3 - bigsnp_obj <- gt_write_bigsnp_from_dfs(genotypes = x, indiv_meta = indiv_meta, loci = loci, @@ -383,7 +380,6 @@ gt_write_bigsnp_from_dfs <- function(genotypes, indiv_meta, loci, sex = 0, affection = 0, ploidy = ploidy) - map <- tibble(chromosome = loci$chromosome, marker.ID = loci$name, genetic.dist = as.double(loci$genetic_dist), ## make sure that genetic.dist is double diff --git a/R/gt_order_loci.R b/R/gt_order_loci.R index 02d03fe8..e4c8e8c2 100644 --- a/R/gt_order_loci.R +++ b/R/gt_order_loci.R @@ -11,12 +11,15 @@ #' reordered; if TRUE, then the current loci table, which might have been reordered #' manually, will be used, but only if the positions within each chromosome are #' sequential +#' @param ignore_genetic_dist boolean to ignore the genetic distance when checking. Note +#' that, if `gentic_dist` are being ignored and they are not sorted, the function will +#' set them to zero to avoid problems with other software. #' @param quiet boolean to suppress information about the files #' @param ... other arguments #' @return A [gen_tibble] #' @export -gt_order_loci <- function(.x, use_current_table = FALSE, quiet = FALSE, ...){ +gt_order_loci <- function(.x, use_current_table = FALSE, ignore_genetic_dist = TRUE, quiet = FALSE, ...){ if (use_current_table){ new_table <- show_loci(.x) } else { @@ -24,7 +27,7 @@ gt_order_loci <- function(.x, use_current_table = FALSE, quiet = FALSE, ...){ show_loci(.x) <- new_table } # if asked to use the current table, check that it is ordered - is_loci_table_ordered(.x, error_on_false = TRUE) - gt_update_backingfile(.x, quiet=quiet, ...) + is_loci_table_ordered(.x, error_on_false = TRUE, ignore_genetic_dist = ignore_genetic_dist) + gt_update_backingfile(.x, quiet=quiet, ignore_genetic_dist = ignore_genetic_dist, ...) } diff --git a/R/gt_update_backingfile.R b/R/gt_update_backingfile.R index 91d3bfa2..2e454bff 100644 --- a/R/gt_update_backingfile.R +++ b/R/gt_update_backingfile.R @@ -9,13 +9,16 @@ #' for backing files used to store the data (they will be given a .bk #' and .RDS automatically). If left to NULL (the default), the file name #' will be based on the name f the current backing file. +#' @param ignore_genetic_dist boolean to ignore the genetic distance when checking. Note +#' that, if `gentic_dist` are being ignored and they are not sorted, the function will +#' set them to zero to avoid problems with other software. #' @param chunk_size the number of loci to process at once #' @param quiet boolean to suppress information about the files #' @returns a [`gen_tibble`] with a backing file (i.e. a new File Backed Matrix) #' @export gt_update_backingfile <- function (.x, backingfile = NULL, chunk_size = NULL, - quiet = FALSE){ + ignore_genetic_dist = TRUE, quiet = FALSE){ # if the backingfile is null, create a name based on the current backing file if (is.null(backingfile)){ backingfile <- change_duplicated_file_name(gt_get_file_names(.x)[2]) @@ -69,6 +72,18 @@ gt_update_backingfile <- function (.x, backingfile = NULL, chunk_size = NULL, # same for map map <- attr(.x$genotypes,"bigsnp")$map map <- map[.gt_bigsnp_cols(.x),] + # if we ignore genetic distance, set it to zero if it is not sorted + if (ignore_genetic_dist){ + if (any(unlist(show_loci(.x) %>% + group_by(.data$chr_int) %>% + group_map(~ is.unsorted(.x$genetic_dist))))){ + if (!quiet){ + message("Genetic distances are not sorted, setting them to zero") + } + show_loci(.x)$genetic_dist <- 0 + } + } + # Create the bigSNP object bigsnp_obj <- structure(list(genotypes = new_bk_matrix, fam = fam, map = map), diff --git a/R/is_loci_table_ordered.R b/R/is_loci_table_ordered.R index ad48caa7..c1480ec1 100644 --- a/R/is_loci_table_ordered.R +++ b/R/is_loci_table_ordered.R @@ -8,26 +8,28 @@ #' or a [`gen_tibble`]. #' @param error_on_false logical, if `TRUE` an error is thrown if the loci #' are not ordered. +#' @param ignore_physical logical, if `TRUE` the physical position is not checked. #' @param ... other arguments passed to specific methods. #' @returns a logical vector defining which loci are transversions #' @rdname is_loci_table_ordered #' @export -is_loci_table_ordered <- function(.x, error_on_false = FALSE, ...) { +is_loci_table_ordered <- function(.x, error_on_false = FALSE, ignore_genetic_dist = TRUE, ...) { UseMethod("is_loci_table_ordered", .x) } #' @export #' @rdname is_loci_table_ordered -is_loci_table_ordered.tbl_df <- function(.x, error_on_false = FALSE, ...) { +is_loci_table_ordered.tbl_df <- function(.x, error_on_false = FALSE, ignore_genetic_dist = TRUE, ...) { #TODO this is a hack to deal with the class being dropped when going through group_map stopifnot_gen_tibble(.x) - is_loci_table_ordered(.x$genotypes, error_on_false, ...) + is_loci_table_ordered(.x$genotypes, error_on_false = error_on_false, + ignore_genetic_dist = ignore_genetic_dist, ...) } #' @export #' @rdname is_loci_table_ordered -is_loci_table_ordered.vctrs_bigSNP <- function(.x, error_on_false = FALSE, ...) { +is_loci_table_ordered.vctrs_bigSNP <- function(.x, error_on_false = FALSE, ignore_genetic_dist = TRUE, ...) { rlang::check_dots_empty() # check that within each chromosome positions are sorted @@ -49,6 +51,20 @@ is_loci_table_ordered.vctrs_bigSNP <- function(.x, error_on_false = FALSE, ...) return(FALSE) } } + + # check genetic distance + if (!ignore_genetic_dist){ + if (any(unlist(show_loci(.x) %>% + group_by(.data$chr_int) %>% + group_map(~ is.unsorted(.x$genetic_dist))))){ + if (error_on_false){ + stop("Your genetic distances are not sorted within chromosomes") + } else { + return(FALSE) + } + } + } + return(TRUE) } diff --git a/man/gt_order_loci.Rd b/man/gt_order_loci.Rd index 40fc4820..b12d8ff8 100644 --- a/man/gt_order_loci.Rd +++ b/man/gt_order_loci.Rd @@ -4,7 +4,13 @@ \alias{gt_order_loci} \title{Order the loci table of a gen_tibble} \usage{ -gt_order_loci(.x, use_current_table = FALSE, quiet = FALSE, ...) +gt_order_loci( + .x, + use_current_table = FALSE, + ignore_genetic_dist = TRUE, + quiet = FALSE, + ... +) } \arguments{ \item{.x}{a \code{\link{gen_tibble}}} @@ -14,6 +20,10 @@ reordered; if TRUE, then the current loci table, which might have been reordered manually, will be used, but only if the positions within each chromosome are sequential} +\item{ignore_genetic_dist}{boolean to ignore the genetic distance when checking. Note +that, if \code{gentic_dist} are being ignored and they are not sorted, the function will +set them to zero to avoid problems with other software.} + \item{quiet}{boolean to suppress information about the files} \item{...}{other arguments} diff --git a/man/gt_update_backingfile.Rd b/man/gt_update_backingfile.Rd index 4d5bfe92..06bc09d7 100644 --- a/man/gt_update_backingfile.Rd +++ b/man/gt_update_backingfile.Rd @@ -4,7 +4,13 @@ \alias{gt_update_backingfile} \title{Update the backing matrix} \usage{ -gt_update_backingfile(.x, backingfile = NULL, chunk_size = NULL, quiet = FALSE) +gt_update_backingfile( + .x, + backingfile = NULL, + chunk_size = NULL, + ignore_genetic_dist = TRUE, + quiet = FALSE +) } \arguments{ \item{.x}{a \code{gen_tibble} object} @@ -16,6 +22,10 @@ will be based on the name f the current backing file.} \item{chunk_size}{the number of loci to process at once} +\item{ignore_genetic_dist}{boolean to ignore the genetic distance when checking. Note +that, if \code{gentic_dist} are being ignored and they are not sorted, the function will +set them to zero to avoid problems with other software.} + \item{quiet}{boolean to suppress information about the files} } \value{ diff --git a/man/is_loci_table_ordered.Rd b/man/is_loci_table_ordered.Rd index b37b9383..e536e58e 100644 --- a/man/is_loci_table_ordered.Rd +++ b/man/is_loci_table_ordered.Rd @@ -6,11 +6,26 @@ \alias{is_loci_table_ordered.vctrs_bigSNP} \title{Test if the loci table is ordered} \usage{ -is_loci_table_ordered(.x, error_on_false = FALSE, ...) +is_loci_table_ordered( + .x, + error_on_false = FALSE, + ignore_genetic_dist = TRUE, + ... +) -\method{is_loci_table_ordered}{tbl_df}(.x, error_on_false = FALSE, ...) +\method{is_loci_table_ordered}{tbl_df}( + .x, + error_on_false = FALSE, + ignore_genetic_dist = TRUE, + ... +) -\method{is_loci_table_ordered}{vctrs_bigSNP}(.x, error_on_false = FALSE, ...) +\method{is_loci_table_ordered}{vctrs_bigSNP}( + .x, + error_on_false = FALSE, + ignore_genetic_dist = TRUE, + ... +) } \arguments{ \item{.x}{a vector of class \code{vctrs_bigSNP} (usually the \code{genotype} column of @@ -21,6 +36,8 @@ or a \code{\link{gen_tibble}}.} are not ordered.} \item{...}{other arguments passed to specific methods.} + +\item{ignore_physical}{logical, if \code{TRUE} the physical position is not checked.} } \value{ a logical vector defining which loci are transversions diff --git a/tests/testthat/test_gen_tibble.R b/tests/testthat/test_gen_tibble.R index ea33beb6..74fadb9c 100644 --- a/tests/testthat/test_gen_tibble.R +++ b/tests/testthat/test_gen_tibble.R @@ -597,7 +597,7 @@ test_that("chr_int is correct",{ chromosome_names <- c("1", "2", NA, "4") expect_true(identical(c(1L,2L,NA,4L), cast_chromosome_to_int(chromosome_names))) chromosome_names <- c("chr1", "chr2", NA, "chr4") - expect_true(identical(c(1L,2L,NA,3L), cast_chromosome_to_int(chromosome_names))) + expect_true(identical(c(1L,2L,NA,4L), cast_chromosome_to_int(chromosome_names))) chromosome_names <- c("a", "b", NA, "c") expect_true(identical(c(1L,2L,NA,3L), cast_chromosome_to_int(chromosome_names))) diff --git a/tests/testthat/test_gt_order_loci.R b/tests/testthat/test_gt_order_loci.R index 37045067..80601e6d 100644 --- a/tests/testthat/test_gt_order_loci.R +++ b/tests/testthat/test_gt_order_loci.R @@ -166,6 +166,34 @@ test_that("gt_order_loci use_current_table = TRUE",{ expect_error(gt_order_loci(test_gt, use_current_table = TRUE, quiet = TRUE),"Your loci are not sorted within chromosomes") }) +test_that("gt_order_loci catches physical positions out of sync",{ + + test_indiv_meta <- data.frame (id=c("a","b","c"), + population = c("pop1","pop1","pop2")) + test_genotypes <- rbind(c(1,1,0,1,1,0), + c(2,1,0,0,0,0), + c(2,2,0,0,1,1)) + test_loci <- data.frame(name=paste0("rs",1:6), + chromosome=paste0("chr",c(1,1,1,1,2,2)), + position=as.integer(c(3,5,65,343,27,83)), + genetic_dist = as.double(c(0.3,0.7,0,0.2,0.0,0.3)), + allele_ref = c("A","T","C","G","C","T"), + allele_alt = c("T","C", NA,"C","G","A")) + + path <- tempfile() + test_gt <- gen_tibble(x = test_genotypes, loci = test_loci, indiv_meta = test_indiv_meta, quiet = TRUE, backingfile = path) + + expect_true(is_loci_table_ordered(test_gt)) + expect_false(is_loci_table_ordered(test_gt, ignore_genetic_dist = FALSE)) + # now order it + ordered_test_gt <- gt_order_loci(test_gt, use_current_table = FALSE, quiet = TRUE) + # now we pass the test as we set genetic distances to zero + expect_true(is_loci_table_ordered(ordered_test_gt, ignore_genetic_dist = FALSE)) + # check that genetic_dist is all zeroes + expect_equal(show_loci(ordered_test_gt)$genetic_dist, rep(0,6)) +}) + + test_that("chr_int from character chromosomes",{ test_indiv_meta <- data.frame (id=c("a","b","c"), population = c("pop1","pop1","pop2")) From be821a616327a2bc289ed987cffeba3cb67b1353 Mon Sep 17 00:00:00 2001 From: Andrea Manica Date: Mon, 16 Dec 2024 08:01:00 +0000 Subject: [PATCH 16/16] fix docs --- R/is_loci_table_ordered.R | 2 +- man/is_loci_table_ordered.Rd | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/is_loci_table_ordered.R b/R/is_loci_table_ordered.R index c1480ec1..75ef28c3 100644 --- a/R/is_loci_table_ordered.R +++ b/R/is_loci_table_ordered.R @@ -8,7 +8,7 @@ #' or a [`gen_tibble`]. #' @param error_on_false logical, if `TRUE` an error is thrown if the loci #' are not ordered. -#' @param ignore_physical logical, if `TRUE` the physical position is not checked. +#' @param ignore_genetic_dist logical, if `TRUE` the physical position is not checked. #' @param ... other arguments passed to specific methods. #' @returns a logical vector defining which loci are transversions #' @rdname is_loci_table_ordered diff --git a/man/is_loci_table_ordered.Rd b/man/is_loci_table_ordered.Rd index e536e58e..7b7dee04 100644 --- a/man/is_loci_table_ordered.Rd +++ b/man/is_loci_table_ordered.Rd @@ -35,9 +35,9 @@ or a \code{\link{gen_tibble}}.} \item{error_on_false}{logical, if \code{TRUE} an error is thrown if the loci are not ordered.} -\item{...}{other arguments passed to specific methods.} +\item{ignore_genetic_dist}{logical, if \code{TRUE} the physical position is not checked.} -\item{ignore_physical}{logical, if \code{TRUE} the physical position is not checked.} +\item{...}{other arguments passed to specific methods.} } \value{ a logical vector defining which loci are transversions