From 0810ee5262d182c082eb96ab91591069c16dee7e Mon Sep 17 00:00:00 2001 From: Markus Riester Date: Fri, 13 Sep 2024 10:33:31 -0400 Subject: [PATCH] +findHighQualitySNPs --- DESCRIPTION | 4 +-- NAMESPACE | 1 + NEWS | 2 ++ R/calculateMappingBiasVcf.R | 47 +++++++++++++++++++++++++++++++++++ man/findHighQualitySNPs.Rd | 49 +++++++++++++++++++++++++++++++++++++ 5 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 man/findHighQualitySNPs.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 8d6df0a..062db43 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,8 +2,8 @@ Package: PureCN Type: Package Title: Copy number calling and SNV classification using targeted short read sequencing -Version: 2.11.1 -Date: 2024-05-10 +Version: 2.11.2 +Date: 2024-09-13 Authors@R: c(person("Markus", "Riester", role = c("aut", "cre"), email = "markus.riester@novartis.com", diff --git a/NAMESPACE b/NAMESPACE index cc7f1f3..1c9ee09 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,7 @@ export(filterVcfBasic) export(filterVcfMuTect) export(filterVcfMuTect2) export(findFocal) +export(findHighQualitySNPs) export(getSexFromCoverage) export(getSexFromVcf) export(plotAbs) diff --git a/NEWS b/NEWS index db30669..d93d82d 100755 --- a/NEWS +++ b/NEWS @@ -7,6 +7,8 @@ NEW FEATURES Tuning parameter for coverage normalization. Default should in most cases be a good compromise between removing most normal noise and not overfitting to pool of normal samples. + o Added findHighQualitySNPs function to extract good SNPs from mapping bias + database. Changes in version 2.10.0 diff --git a/R/calculateMappingBiasVcf.R b/R/calculateMappingBiasVcf.R index df9e972..43bc327 100644 --- a/R/calculateMappingBiasVcf.R +++ b/R/calculateMappingBiasVcf.R @@ -443,3 +443,50 @@ calculateMappingBiasGatk4 <- function(workspace, reference.genome, gr$clustered[idxa] <- TRUE return(gr) } + +#' Find High Quality SNPs +#' +#' Function to extract high quality SNPs from the mapping bias database. +#' Useful for generating fingerprinting panels etc. +#' +#' +#' @param mapping.bias.file Generated by \code{\link{calculateMappingBiasVcf}}. +#' @param max.bias Maximum mapping bias +#' @param min.pon Minimum number of normal samples, useful to get reliable +#' mapping bias. +#' @param triallelic By default, ignore positions with multiple alt alleles. +#' @param vcf.file Optional VCF file (for example dbSNP). Needs to be +#' bgzip and tabix processed. +#' @param genome See \code{readVcf} +#' @return A \code{GRanges} object with mapping bias passing filters. +#' If \code{vcf.file} is provided, it will be the variants in the +#' corresponding file overlapping with the passed variants. +#' @author Markus Riester +#' @examples +#' +#' normal.panel.vcf <- system.file("extdata", "normalpanel.vcf.gz", +#' package = "PureCN") +#' bias <- calculateMappingBiasVcf(normal.panel.vcf, genome = "h19") +#' +#' @export findHighQualitySNPs +findHighQualitySNPs <- function(mapping.bias.file, max.bias = 0.2, min.pon = 2, + triallelic = FALSE, vcf.file = NULL, genome) { + if (is(mapping.bias.file, "GRanges")) { + bias <- mapping.bias.file + } else { + bias <- readRDS(mapping.bias.file) + } + bias <- bias[which(abs(bias$bias - 1) <= max.bias & (triallelic | !bias$triallelic) & bias$pon.count >= min.pon), ] + if (is.null(vcf.file)) return(bias) + vcfSeqStyle <- .getSeqlevelsStyle(headerTabix( + TabixFile(vcf.file))$seqnames) + + biasRenamedSL <- bias + if (!length(intersect(seqlevelsStyle(bias), vcfSeqStyle))) { + seqlevelsStyle(biasRenamedSL) <- vcfSeqStyle[1] + } + flog.info("Reading VCF...") + vcf <- readVcf(vcf.file, genome = genome, ScanVcfParam(which = reduce(biasRenamedSL))) + ov <- findOverlaps(biasRenamedSL, vcf, type = "equal") + return(vcf[subjectHits(ov)]) +} diff --git a/man/findHighQualitySNPs.Rd b/man/findHighQualitySNPs.Rd new file mode 100644 index 0000000..be5fff6 --- /dev/null +++ b/man/findHighQualitySNPs.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/calculateMappingBiasVcf.R +\name{findHighQualitySNPs} +\alias{findHighQualitySNPs} +\title{Find High Quality SNPs} +\usage{ +findHighQualitySNPs( + mapping.bias.file, + max.bias = 0.2, + min.pon = 2, + triallelic = FALSE, + vcf.file = NULL, + genome +) +} +\arguments{ +\item{mapping.bias.file}{Generated by \code{\link{calculateMappingBiasVcf}}.} + +\item{max.bias}{Maximum mapping bias} + +\item{min.pon}{Minimum number of normal samples, useful to get reliable +mapping bias.} + +\item{triallelic}{By default, ignore positions with multiple alt alleles.} + +\item{vcf.file}{Optional VCF file (for example dbSNP). Needs to be +bgzip and tabix processed.} + +\item{genome}{See \code{readVcf}} +} +\value{ +A \code{GRanges} object with mapping bias passing filters. +If \code{vcf.file} is provided, it will be the variants in the +corresponding file overlapping with the passed variants. +} +\description{ +Function to extract high quality SNPs from the mapping bias database. +Useful for generating fingerprinting panels etc. +} +\examples{ + +normal.panel.vcf <- system.file("extdata", "normalpanel.vcf.gz", + package = "PureCN") +bias <- calculateMappingBiasVcf(normal.panel.vcf, genome = "h19") + +} +\author{ +Markus Riester +}