Skip to content

Commit

Permalink
roh
Browse files Browse the repository at this point in the history
  • Loading branch information
dramanica committed Apr 19, 2024
1 parent 501b72c commit ecae955
Show file tree
Hide file tree
Showing 10 changed files with 415 additions and 100 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ Imports:
Suggests:
adegenet,
knitr,
detectRUNS,
LEA,
rmarkdown,
readr,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ export(gt_pca_clust_best_k)
export(gt_pca_find_clusters)
export(gt_pca_partialSVD)
export(gt_pca_randomSVD)
export(gt_roh_window)
export(gt_save)
export(gt_set_imputed)
export(gt_uses_imputed)
Expand Down
104 changes: 104 additions & 0 deletions R/gt_roh_window.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#' Detect runs of homozygosity using a sliding-window approach
#'
#' This function uses a sliding-window approach to look for runs of homozygosity (or
#' heterozygosity) in a diploid genome. This function uses the package `selectRUNS`,
#' which implements an approach equivalent to the one in PLINK.
#'
#' @param x a [gen_tibble]
#' @param window_size the size of sliding window (number of SNP loci) (default = 15)
#' @param threshold the threshold of overlapping windows of the same state
#' (homozygous/heterozygous) to call a SNP in a RUN (default = 0.05)
#' @param min_snp minimum n. of SNP in a RUN (default = 3)
#' @param heterozygosity should we look for runs of heterozygosity (instead of homozygosity? (default = FALSE)
#' @param max_opp_window max n. of SNPs of the opposite type (e.g. heterozygous
#' snps for runs of homozygosity) in the
#' sliding window (default = 1)
#' @param max_miss_window max. n. of missing SNP in the sliding window (default = 1)
#' @param max_gap max distance between consecutive SNP to be still considered a
#' potential run (default = 10^6 bps)
#' @param min_length_bps minimum length of run in bps (defaults to 1000 bps = 1 kbps)
#' @param min_density minimum n. of SNP per kbps (defaults to 0.1 = 1 SNP every 10 kbps)
#' @param max_opp_run max n. of opposite genotype SNPs in the run (optional)
#' @param max_miss_run max n. of missing SNPs in the run (optional)
#'
#' @details
#' This function returns a data frame with all runs detected in the dataset.
#' This data frame can then be written out to a csv file.
#' The data frame is, in turn, the input for other functions of the detectRUNS package
#' that create plots and produce statistics from the results
#' (see plots and statistics functions in this manual,
#' and/or refer to the detectRUNS vignette).
#'
#' If the [`gen_tibble`] is grouped, then the grouping variable is used to fill in the
#' group table. Otherwise, the group 'column' is filled with the same values as the 'id'
#' column
#'
#' @return A dataframe with RUNs of Homozygosity or Heterozygosity in the analysed dataset.
#' The returned dataframe contains the following seven columns: "group", "id", "chrom",
#' "nSNP", "from", "to", "lengthBps" (group: population, breed, case/control etc.;
#' id: individual identifier; chrom: chromosome on which the run is located;
#' nSNP: number of SNPs in the run; from: starting position of the run, in bps;
#' to: end position of the run, in bps; lengthBps: size of the run)
#' @export
#'
#' @examples
#' # don't run the example
#' if (FALSE) {
#' sheep_bed <- system.file("extdata/sheep.bed", package="tidypopgen")
#' sheep_gt <- tidypopgen::gen_tibble(sheep_bed, backingfile = tempfile(), quiet=TRUE)
#' sheep_gt <- sheep_gt %>% group_by(population)
#' sheep_roh <- gt_roh_window(sheep_gt)
#' detectRUNS::plot_Runs(runs = sheep_roh)
#' }


gt_roh_window <- function(x, window_size = 15, threshold = 0.05,
min_snp = 3, heterozygosity = FALSE, max_opp_window = 1, max_miss_window = 1,
max_gap = 10^6, min_length_bps = 1000, min_density = 1/1000,
max_opp_run = NULL, max_miss_run = NULL) {
if (!requireNamespace("detectRUNS", quietly = TRUE)) {
stop(
"to use this function, first install package 'detectRUNS' with\n",
"install.packages('detectRUNS')")
}
# collect all parameters in a variable
parameters <- list(windowSize=window_size, threshold=threshold, minSNP=min_snp,
ROHet=heterozygosity, maxOppWindow=max_opp_window,
maxMissWindow=max_miss_window, maxGap=max_gap, minLengthBps=min_length_bps,
minDensity=min_density, maxOppRun=max_opp_run, maxMissRun=max_miss_run)

# create a map object
map <- show_loci(x) %>% dplyr::select(dplyr::all_of(c("chromosome","name","position"))) %>%
dplyr::rename("Chrom"="chromosome", "SNP"="name", "bps"="position") %>% as.data.frame()

# compute the gaps between snps
gaps <- diff(map$bps)
# use groups (if defined)
if (dplyr::is_grouped_df(x)){
groups <- x %>% select(dplyr::group_vars(x)) %>% dplyr::pull(1)
} else {
groups <- x$id
}
# initialize data.frame of results
runs_df <- data.frame(group=character(), id=character(), chrom=character(), nSNP=integer(),
from=integer(), to=integer(), lengthBps=integer())
# naively process it by row (the parallelism is implemented within individual)
# access time is horrible, but I don't think this is the bottleneck
# it needs some profiling
X <- gt_get_bigsnp(x)$genotypes # pointer for the FBM
for (i in seq_len(nrow(x))){
this_genotype <- X[i,]
this_indiv <- list(FID=groups[i], IID=x$id[i])
# find runs for this individual
this_runs <- utils::getFromNamespace("slidingRuns", "detectRUNS")(this_genotype, this_indiv, map, gaps, parameters)
# bind this run (if has rows) to others RUNs (if any)
runs_df <- rbind(runs_df, this_runs)
}

# fix row names
row.names(runs_df) <- NULL

# return calculated runs (data.frame)
return(runs_df)

}
100 changes: 0 additions & 100 deletions data-raw/datasets/sheep.fam

This file was deleted.

103 changes: 103 additions & 0 deletions data-raw/ideas/gt_roh.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#' Detect runs of homozygosity based on windows
#'
#' This function uses a sliding-window approach to look for runs of homozygosity (or
#' heterozygosity) in a diploid genome. This function uses the package `selectRUNS`,
#' which implements an approach equivalent to the one in PLINK.
#'
#' @param x a [gen_tibble]
#' @param window_size the size of sliding window (number of SNP loci) (default = 15)
#' @param threshold the threshold of overlapping windows of the same state
#' (homozygous/heterozygous) to call a SNP in a RUN (default = 0.05)
#' @param min_snp minimum n. of SNP in a RUN (default = 3)
#' @param heterozygosity should we look for runs of heterozygosity (instead of homozygosity? (default = FALSE)
#' @param max_opp_window max n. of SNPs of the opposite type (e.g. heterozygous
#' snps for runs of homozygosity) in the
#' sliding window (default = 1)
#' @param max_miss_window max. n. of missing SNP in the sliding window (default = 1)
#' @param max_gap max distance between consecutive SNP to be still considered a
#' potential run (default = 10^6 bps)
#' @param min_length_bps minimum length of run in bps (defaults to 1000 bps = 1 kbps)
#' @param min_density minimum n. of SNP per kbps (defaults to 0.1 = 1 SNP every 10 kbps)
#' @param max_opp_run max n. of opposite genotype SNPs in the run (optional)
#' @param max_miss_run max n. of missing SNPs in the run (optional)
#'
#' @details
#' This function returns a data frame with all runs detected in the dataset.
#' This data frame can then be written out to a csv file.
#' The data frame is, in turn, the input for other functions of the detectRUNS package
#' that create plots and produce statistics from the results
#' (see plots and statistics functions in this manual,
#' and/or refer to the detectRUNS vignette).
#'
#' If the [`gen_tibble`] is grouped, then the grouping variable is used to fill in the
#' group table. Otherwise, the group 'column' is filled with the same values as the 'id'
#' column
#'
#' @return A dataframe with RUNs of Homozygosity or Heterozygosity in the analysed dataset.
#' The returned dataframe contains the following seven columns: "group", "id", "chrom",
#' "nSNP", "from", "to", "lengthBps" (group: population, breed, case/control etc.;
#' id: individual identifier; chrom: chromosome on which the run is located;
#' nSNP: number of SNPs in the run; from: starting position of the run, in bps;
#' to: end position of the run, in bps; lengthBps: size of the run)
#' @export
#'
#' @examples
#' # don't run the example
#' if (FALSE) {
#' sheep_bed <- systems.file("extdata/sheep.bed", package="tidypopgen")
#' sheep_gt <- tidypopgen::gen_tibble(bed_path, backingfile = tempfile(), quiet=TRUE)
#' sheep_roh <- gt_roh(sheep_gt)
#' }


gt_roh <- function(x, window_size = 15, threshold = 0.05,
min_snp = 3, ROHet = FALSE, max_opp_window = 1, max_miss_window = 1,
max_gap = 10^6, min_length_bps = 1000, min_density = 1/1000,
max_opp_run = NULL, max_miss_run = NULL) {
if (!requireNamespace("detectRUNS", quietly = TRUE)) {
stop(
"to use this function, first install package 'detectRUNS' with\n",
"install.packages('detectRUNS')")
}
# collect all parameters in a variable
parameters <- list(windowSize=window_size, threshold=threshold, minSNP=min_snp,
ROHet=ROHet, maxOppWindow=max_opp_window,
maxMissWindow=max_miss_window, maxGap=max_gap, minLengthBps=min_length_bps,
minDensity=min_density, maxOppRun=max_opp_run, maxMissRun=max_miss_run)

# create a map object
map <- show_loci(x) %>% dplyr::select(dplyr::all_of(c("chromosome","name","position"))) %>%
dplyr::rename("Chrom"="chromosome", "SNP"="name", "bps"="position") %>% as.data.frame()

# compute the gaps between snps
gaps <- diff(map$bps)
# use groups (if defined)
if (dplyr::is_grouped_df(sheep_gt)){
groups <- x %>% dplyr::group_keys()
} else {
groups <- x$id
}

# initialize data.frame of results
runs_df <- data.frame(group=character(), id=character(), chrom=character(), nSNP=integer(),
from=integer(), to=integer(), lengthBps=integer())
# naively process it by row (the parallelism is implemented within individual)
# access time is horrible, but I don't think this is the bottleneck
# it needs some profiling
X <- gt_get_bigsnp(x)$genotypes # pointer for the FBM
for (i in seq_len(nrow(x))){
this_genotype <- X[i,]
this_indiv <- list(FID=groups[i], IID=x$id[i])
# find runs for this individual
this_runs <- utils::getFromNamespace("slidingRuns", "detectRUNS")(this_genotype, this_indiv, map, gaps, parameters)
# bind this run (if has rows) to others RUNs (if any)
runs_df <- rbind(runs_df, this_runs)
}

# fix row names
row.names(runs_df) <- NULL

# return calculated runs (data.frame)
return(runs_df)

}
21 changes: 21 additions & 0 deletions data-raw/ideas/sheep_for_roh.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
genotypeFilePath <- system.file(
"extdata", "Kijas2016_Sheep_subset.ped", package="detectRUNS")
mapFilePath <- system.file(
"extdata", "Kijas2016_Sheep_subset.map", package="detectRUNS")

sheep_ped <- snpStats::read.pedfile(genotypeFilePath)
map <- read.table(mapFilePath)
names(map)<-c("chromosome", "id", "genetic.distance","position")
sheep_ped$map<-cbind(sheep_ped$map,map)

snpStats::write.plink("./data-raw/datasets/sheep",snps = sheep_ped$genotypes,
pedigree = sheep_ped$fam$pedigree,
id = sheep_ped$fam$member,
sex = sheep_ped$fam$sex,
chromosome = sheep_ped$map$chromosome,
allele.1 = sheep_ped$map$allele.1,
allele.2 = sheep_ped$map$allele.2,
position = sheep_ped$map$position,
genetic.distance = sheep_ped$map$genetic.distance,
human.genome = FALSE
)
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit ecae955

Please sign in to comment.