Skip to content

Commit

Permalink
Added dbr.per1k argument, updated dbr-related documentation as per #110
Browse files Browse the repository at this point in the history
  • Loading branch information
plger committed Sep 19, 2024
1 parent ba4e32e commit 3bbfa3f
Show file tree
Hide file tree
Showing 8 changed files with 71 additions and 43 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: scDblFinder
Type: Package
Title: scDblFinder
Version: 1.19.4
Version: 1.19.5
Authors@R: c(
person("Pierre-Luc", "Germain", email="[email protected]", role=c("cre","aut"), comment=c(ORCID="0000-0003-3418-4218")),
person("Aaron", "Lun", email="[email protected]", role="ctb"))
Expand Down
8 changes: 5 additions & 3 deletions R/doubletThresholding.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#' doublet rate will be adjusted for homotypic doublets.
#' @param dbr.sd The standard deviation of the doublet rate, representing the
#' uncertainty in the estimate. Ignored if `method!="optim"`.
#' @param dbr.per1k The expected proportion of doublets per 1000 cells.
#' @param stringency A numeric value >0 and <1 which controls the relative weight of false
#' positives (i.e. real cells) and false negatives (artificial doublets) in setting the
#' threshold. A value of 0.5 gives equal weight to both; a higher value (e.g. 0.7) gives
Expand Down Expand Up @@ -38,7 +39,8 @@
#'
#' @importFrom stats mad qnorm setNames
#' @export
doubletThresholding <- function( d, dbr=NULL, dbr.sd=NULL, stringency=0.5, p=0.1,
doubletThresholding <- function( d, dbr=NULL, dbr.sd=NULL, dbr.per1k=0.008,
stringency=0.5, p=0.1,
method=c("auto","optim","dbr","griffiths"),
perSample=TRUE, returnType=c("threshold","call")){
method <- match.arg(method)
Expand Down Expand Up @@ -192,15 +194,15 @@ doubletThresholding <- function( d, dbr=NULL, dbr.sd=NULL, stringency=0.5, p=0.1
d
}

.getDoubletStats <- function( d, th, dbr=NULL, dbr.sd=0.015 ){
.getDoubletStats <- function( d, th, dbr=NULL, dbr.sd=0.015, dbr.per1k=0.008 ){
# check that we have all necessary fields:
fields <- c("cluster","src","type","score","mostLikelyOrigin", "originAmbiguous","difficulty")
if(!all(fields %in% colnames(d))) stop("Input misses some columns.")
if(!is.null(d$sample))
return(dplyr::bind_rows(lapply(split(seq_len(nrow(d)), d$sample), FUN=function(i){
.getDoubletStats(d[i,fields], th, dbr=dbr, dbr.sd=dbr.sd)
}), .id="sample"))
if(is.null(dbr)) dbr <- 0.01*sum(d$src=="real",na.rm=TRUE)/1000
if(is.null(dbr)) dbr <- dbr.per1k*sum(d$src=="real",na.rm=TRUE)/1000
o <- d$mostLikelyOrigin[d$type=="real" & d$score>=th]
expected <- getExpectedDoublets(d$cluster[d$src=="real"], dbr=dbr)
stats <- .compareToExpectedDoublets(o, dbr=dbr, expected=expected)
Expand Down
10 changes: 6 additions & 4 deletions R/misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#' @param dbr The expected doublet rate.
#' @param only.heterotypic Logical; whether to return expectations only for
#' heterotypic doublets
#' @param dbr.per1k The expected proportion of doublets per 1000 cells.
#'
#' @return The expected number of doublets of each combination of clusters
#'
Expand All @@ -43,7 +44,8 @@
#' cl <- sample(head(LETTERS,4), size=2000, prob=c(.4,.2,.2,.2), replace=TRUE)
#' getExpectedDoublets(cl)
#' @export
getExpectedDoublets <- function(x, dbr=NULL, only.heterotypic=TRUE){
getExpectedDoublets <- function(x, dbr=NULL, only.heterotypic=TRUE,
dbr.per1k=0.008){
if(is(x,"SingleCellExperiment")){
clusters <- x$scDblFinder.clusters
}else{
Expand All @@ -54,7 +56,7 @@ getExpectedDoublets <- function(x, dbr=NULL, only.heterotypic=TRUE){
if(all(grepl("^[0-9]*$",lvls))) lvls <- as.integer(lvls)
clusters <- as.integer(clusters)
ncells <- length(clusters)
if(is.null(dbr)) dbr <- (0.01*ncells/1000)
if(is.null(dbr)) dbr <- (dbr.per1k*ncells/1000)
if(length(unique(clusters))==1) return(ncells*dbr)

cs <- table(clusters)/ncells
Expand Down Expand Up @@ -371,7 +373,7 @@ cxds2 <- function(x, whichDbls=c(), ntop=500, binThresh=NULL){
}

# gets a global doublet rate from samples' doublet rates
.gdbr <- function(d, dbr=NULL){
.gdbr <- function(d, dbr=NULL, dbr.per1k=0.008){
if(!is.null(dbr)){
if(length(dbr)==1) return(dbr)
stopifnot(!is.null(d$sample))
Expand All @@ -384,7 +386,7 @@ cxds2 <- function(x, whichDbls=c(), ntop=500, binThresh=NULL){
## estimate a global doublet rate
sl <- as.numeric(table(d$sample, d$src=="real")[,2])
}
dbr <- (0.01*sl/1000)
dbr <- (dbr.per1k*sl/1000)
sum(dbr*sl)/sum(sl)
}

Expand Down
57 changes: 32 additions & 25 deletions R/scDblFinder.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,21 @@
#' @param nfeatures The number of top features to use. Alternatively, a
#' character vectors of feature names (e.g. highly-variable genes) to use.
#' @param dims The number of dimensions used.
#' @param dbr The expected doublet rate. By default this is assumed to be 1\%
#' per thousand cells captured (so 4\% among 4000 thousand cells), which is
#' appropriate for 10x datasets. Corrections for homeotypic doublets will be
#' performed on the given rate.
#' @param dbr The expected doublet rate, i.e. the proportion of the cells
#' expected to be doublets. If omitted, will be calculated automatically based
#' on the `dbr.per1k` argument and the number of cells.
#' @param dbr.sd The uncertainty range in the doublet rate, interpreted as
#' a +/- around `dbr`. During thresholding, deviation from the expected doublet
#' rate will be calculated from these boundaries, and will be considered null
#' within these boundaries. If NULL, will be 40\% of `dbr`. Set to `dbr.sd=0` to
#' disable the uncertainty around the doublet rate, or to `dbr.sd=1` to disable
#' any expectation of the number of doublets (thus letting the thresholding be
#' entirely driven by the misclassification of artificial doublets).
#' @param dbr.per1k This is an alternative way of providing the expected doublet
#' rate as a fraction of the number of (the thousands of) cells captured. The
#' default, 0.008 (e.g. 3.2\% doublets among 4000 cells), is appropriate for
#' standard 10X chips. For High Throughput (HT) 10X chips, use half, i.e.
#' 0.004. (Some more recent chips might have this rate even lower).
#' @param k Number of nearest neighbors (for KNN graph). If more than one value
#' is given, the doublet density will be calculated at each k (and other values
#' at the highest k), and all the information will be used by the classifier.
Expand Down Expand Up @@ -195,8 +199,8 @@
scDblFinder <- function(
sce, clusters=NULL, samples=NULL, clustCor=NULL, artificialDoublets=NULL,
knownDoublets=NULL, knownUse=c("discard","positive"), dbr=NULL, dbr.sd=NULL,
nfeatures=1352, dims=20, k=NULL, removeUnidentifiable=TRUE, includePCs=19,
propRandom=0, propMarkers=0, aggregateFeatures=FALSE,
dbr.per1k=0.08, nfeatures=1352, dims=20, k=NULL, removeUnidentifiable=TRUE,
includePCs=19, propRandom=0, propMarkers=0, aggregateFeatures=FALSE,
returnType=c("sce","table","full","counts"),
score=c("xgb","weighted","ratio"), processing="default", metric="logloss",
nrounds=0.25, max_depth=4, iter=3, trainingFeatures=NULL, unident.th=NULL,
Expand Down Expand Up @@ -235,6 +239,7 @@ scDblFinder <- function(
.checkPropArg(propMarkers)
.checkPropArg(propRandom)
.checkPropArg(dbr.sd)
.checkPropArg(dbr.per1k)
.checkPropArg(dbr, acceptNull=TRUE)
processing <- .checkProcArg(processing)

Expand Down Expand Up @@ -288,10 +293,10 @@ scDblFinder <- function(
}
out <- tryCatch(
scDblFinder(sce[sel_features,x], clusters=clusters, dims=dims, dbr=dbr,
dbr.sd=dbr.sd, clustCor=clustCor, unident.th=unident.th,
knownDoublets=knownDoublets, knownUse=knownUse,
artificialDoublets=artificialDoublets, k=k,
processing=processing, nfeatures=nfeatures,
dbr.sd=dbr.sd, dbr.per1k=dbr.per1k, clustCor=clustCor,
unident.th=unident.th, knownDoublets=knownDoublets,
knownUse=knownUse, artificialDoublets=artificialDoublets,
k=k, processing=processing, nfeatures=nfeatures,
propRandom=propRandom, includePCs=includePCs,
propMarkers=propMarkers, trainingFeatures=trainingFeatures,
returnType=ifelse(returnType=="counts","counts","table"),
Expand All @@ -311,12 +316,12 @@ scDblFinder <- function(
if(multiSampleMode!="split"){
## score and thresholding
d <- .scDblscore(d, scoreType=score, threshold=threshold, dbr=dbr,
dbr.sd=dbr.sd, max_depth=max_depth, nrounds=nrounds,
iter=iter, BPPARAM=BPPARAM, verbose=verbose,
dbr.sd=dbr.sd, dbr.per1k=dbr.per1k, max_depth=max_depth,
nrounds=nrounds, iter=iter, BPPARAM=BPPARAM,
features=trainingFeatures, unident.th=unident.th,
metric=metric, filterUnidentifiable=removeUnidentifiable,
perSample=multiSampleMode=="singleModelSplitThres",
includeSamples=TRUE)
includeSamples=TRUE, verbose=verbose)
}
if(returnType=="table") return(d)
return(.scDblAddCD(sce, d))
Expand Down Expand Up @@ -465,7 +470,8 @@ scDblFinder <- function(
}

ex <- NULL
if(!is.null(clusters)) ex <- getExpectedDoublets(clusters, dbr)
if(!is.null(clusters)) ex <- getExpectedDoublets(clusters, dbr,
dbr.per1k=dbr.per1k)

if(verbose) message("Evaluating kNN...")
d <- .evaluateKNN(pca, ctype, ado2, expected=ex, k=k)
Expand All @@ -492,10 +498,10 @@ scDblFinder <- function(
includePCs <- includePCs[includePCs<ncol(pca)]
d <- .scDblscore(d, scoreType=score, addVals=pca[,includePCs,drop=FALSE],
threshold=threshold, dbr=dbr, dbr.sd=dbr.sd, nrounds=nrounds,
max_depth=max_depth, iter=iter, BPPARAM=BPPARAM,
dbr.per1k=dbr.per1k, max_depth=max_depth, iter=iter,
features=trainingFeatures, verbose=verbose, metric=metric,
filterUnidentifiable=removeUnidentifiable,
unident.th=unident.th)
unident.th=unident.th, BPPARAM=BPPARAM)

#if(characterize) d <- .callDblType(d, pca, knn=knn, origins=ado2)
if(returnType=="table") return(d)
Expand Down Expand Up @@ -590,10 +596,11 @@ scDblFinder <- function(
#' @importFrom stats predict quantile
.scDblscore <- function(d, scoreType="xgb", nrounds=NULL, max_depth=5, iter=2,
threshold=TRUE, verbose=TRUE, dbr=NULL, dbr.sd=NULL,
features=NULL, filterUnidentifiable=TRUE, addVals=NULL,
metric="logloss", eta=0.3, BPPARAM=SerialParam(),
includeSamples=FALSE, perSample=TRUE, unident.th=0.1, ...){
gdbr <- .gdbr(d, dbr)
dbr.per1k=dbr.per1k, features=NULL, addVals=NULL,
filterUnidentifiable=TRUE, metric="logloss", eta=0.3,
BPPARAM=SerialParam(), includeSamples=FALSE,
perSample=TRUE, unident.th=0.1, ...){
gdbr <- .gdbr(d, dbr, dbr.per1k=dbr.per1k)
if(!is.null(d$sample) && length(unique(d$sample))==1) d$sample <- NULL
if(is.null(dbr.sd)) dbr.sd <- 0.3*gdbr+0.025
if(scoreType=="xgb"){
Expand Down Expand Up @@ -650,8 +657,8 @@ scDblFinder <- function(
# as well as unidentifiable artificial doublets
w1 <- which(d$type=="real" &
doubletThresholding(d, dbr=dbr, dbr.sd=dbr.sd, stringency=0.7,
perSample=perSample,
returnType="call")=="doublet")
dbr.per1k=dbr.per1k, perSample=perSample,
returnType="call")=="doublet")
if(length(w1) > sum(d$type=="real")/3){
# enforce max prop excluded
w1 <- head(order(d$type!="real", -d$score),
Expand Down Expand Up @@ -700,8 +707,8 @@ scDblFinder <- function(
}
d <- DataFrame(d)
if(threshold){
th <- doubletThresholding( d, dbr=dbr, dbr.sd=dbr.sd, perSample=perSample,
... )
th <- doubletThresholding( d, dbr=dbr, dbr.sd=dbr.sd, dbr.per1k=dbr.per1k,
perSample=perSample, ... )
if(!is.null(d$sample) && length(th)>1){
d$class <- ifelse(d$score >= th[d$sample], "doublet", "singlet")
}else{
Expand All @@ -711,7 +718,7 @@ scDblFinder <- function(
## set class of known (i.e. inputted) doublets:
d$class[d$src=="real" & d$type=="doublet"] <- "doublet"
if(!is.null(d$mostLikelyOrigin)){
th.stats <- .getDoubletStats(d, th, dbr, dbr.sd)
th.stats <- .getDoubletStats(d, th, dbr, dbr.sd, dbr.per1k=dbr.per1k)
metadata(d)$scDblFinder.stats <- th.stats
}
metadata(d)$scDblFinder.threshold <- th
Expand Down
3 changes: 3 additions & 0 deletions man/doubletThresholding.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion man/getExpectedDoublets.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 10 additions & 4 deletions man/scDblFinder.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 3bbfa3f

Please sign in to comment.