From a41c54ef080f457abddcee58411707450fd8cdfe Mon Sep 17 00:00:00 2001 From: Marc Williams Date: Wed, 2 Oct 2024 11:20:45 +0100 Subject: [PATCH] updates * Default to no filtering due to low bin numbers, filter cells for phasing but do not remove * data.table update * Convert to dt in add_states * Option for male/femal * Do not include chrY in sims * add dplyr tag to filter call * update vignette * fix test due to chrY * fix some warnings from deprecations --------- --- NAMESPACE | 1 + R/callHSCN.R | 51 ++++++++++++++++++++++++++-------- R/clustering.R | 3 +- R/import_packages.R | 3 ++ R/plotting.R | 20 ++++++------- R/sim-data.R | 2 +- R/util.R | 3 +- man/callHaplotypeSpecificCN.Rd | 9 ++++-- tests/testthat/test-util.R | 2 +- vignettes/ASCN.Rmd | 4 +-- 10 files changed, 67 insertions(+), 31 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index d32eee9f..0fd66eea 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -98,5 +98,6 @@ export(widen_bins) export(widen_haplotypebins) import(data.table) importFrom(Rcpp,sourceCpp) +importFrom(data.table,":=") importFrom(magrittr,"%>%") useDynLib(signals) diff --git a/R/callHSCN.R b/R/callHSCN.R index baffbddc..ac79f5c8 100644 --- a/R/callHSCN.R +++ b/R/callHSCN.R @@ -634,6 +634,7 @@ filter_haplotypes <- function(haplotypes, fraction){ #' @param fillmissing For bins with missing counts fill in values based on neighbouring bins #' @param global_phasing_for_diploid When using cluster_per_chr, use all cells for phasing diploid regions within the cluster #' @param chrs_for_global_phasing Which chromosomes to phase using all cells for diploid regions, default is NULL which uses all chromosomes +#' @param female Default is `TRUE`, if set to `FALSE` and patient is "XY", X chromosome states are set to A|0 where A=Hmmcopy state #' #' @return Haplotype specific copy number object #' @@ -677,8 +678,8 @@ callHaplotypeSpecificCN <- function(CNbins, phasebyarm = FALSE, minfrachaplotypes = 0.7, likelihood = "auto", - minbins = 100, - minbinschr = 10, + minbins = 0, + minbinschr = 0, phased_haplotypes = NULL, clustering_method = "copy", maxloherror = 0.035, @@ -691,7 +692,8 @@ callHaplotypeSpecificCN <- function(CNbins, smoothsingletons = TRUE, fillmissing = TRUE, global_phasing_for_balanced = FALSE, - chrs_for_global_phasing = NULL) { + chrs_for_global_phasing = NULL, + female = TRUE) { if (!clustering_method %in% c("copy", "breakpoints")) { stop("Clustering method must be one of copy or breakpoints") } @@ -720,6 +722,17 @@ callHaplotypeSpecificCN <- function(CNbins, haplotypes$chr <- sub("chr", "", haplotypes$chr) } + if (female == TRUE){ + #do not infer states for chr "Y" + #haplotypes <- dplyr::filter(haplotypes, chr != "Y") + #CNbins <- dplyr::filter(CNbins, chr != "Y") + } else{ + #do not infer states for chr "X" or "Y" + #haplotypes <- dplyr::filter(haplotypes, chr != "Y") + haplotypes <- dplyr::filter(haplotypes, chr != "X") + #CNbins <- dplyr::filter(CNbins, chr != "Y") + } + nhaplotypes <- haplotypes %>% dplyr::group_by(cell_id) %>% dplyr::summarize(n = sum(totalcounts)) %>% @@ -794,18 +807,21 @@ callHaplotypeSpecificCN <- function(CNbins, cells_to_remove <- ave_dist %>% dplyr::filter(average_distance >= 0.05) message(paste0("Removing ", dim(cells_to_remove)[1], " cells for phasing")) - ascn <- ascn %>% dplyr::filter(cell_id %in% cells_to_keep$cell_id) - haplotypes <- haplotypes %>% dplyr::filter(cell_id %in% cells_to_keep$cell_id) + ascn_filt <- ascn %>% dplyr::filter(cell_id %in% cells_to_keep$cell_id) + haplotypes_filt <- haplotypes %>% dplyr::filter(cell_id %in% cells_to_keep$cell_id) + } else{ + ascn_filt <- ascn + haplotypes_filt <- haplotypes } - infloherror <- ascn %>% + infloherror <- ascn_filt %>% dplyr::filter(state_phase == "A-Hom") %>% dplyr::summarise(err = weighted.mean(x = BAF, w = totalcounts, na.rm = TRUE)) %>% dplyr::pull(err) infloherror <- min(infloherror, maxloherror) # ensure loh error rate is < maxloherror if (likelihood == "betabinomial" | likelihood == "auto") { - bbfit <- fitBB(ascn) + bbfit <- fitBB(ascn_filt) if (bbfit$taronesZ > 5) { likelihood <- "betabinomial" message(paste0("Tarones Z-score: ", round(bbfit$taronesZ, 3), ", using ", likelihood, " model for inference.")) @@ -824,11 +840,11 @@ callHaplotypeSpecificCN <- function(CNbins, ) } - ascn$balance <- ifelse(ascn$phase == "Balanced", 0, 1) + ascn_filt$balance <- ifelse(ascn_filt$phase == "Balanced", 0, 1) if (is.null(phased_haplotypes)) { - chrlist <- proportion_imbalance(ascn, - haplotypes, + chrlist <- proportion_imbalance(ascn_filt, + haplotypes_filt, phasebyarm = phasebyarm, minfrachaplotypes = minfrachaplotypes, mincells = mincells, @@ -839,8 +855,8 @@ callHaplotypeSpecificCN <- function(CNbins, propdf <- chrlist$propdf chrlist <- chrlist$chrlist phased_haplotypes <- phase_haplotypes_bychr( - ascn = ascn, - haplotypes = haplotypes, + ascn = ascn_filt, + haplotypes = haplotypes_filt, chrlist = chrlist, phasebyarm = phasebyarm, global_phasing_for_balanced = global_phasing_for_balanced, @@ -920,6 +936,17 @@ callHaplotypeSpecificCN <- function(CNbins, tidyr::fill( c("state_min", "A", "B", "state_phase", "state_AS", "state_AS_phased", "LOH", "state_BAF", "phase"), .direction = "downup") %>% dplyr::ungroup() + #add 0|0 states for hom deletions + out[["data"]] <- out[["data"]] %>% + dplyr::mutate(A = ifelse(state == 0, 0, A)) %>% + dplyr::mutate(B = ifelse(state == 0, 0, B)) + if (female == FALSE){ + #for male patients set A == state and B == 0 + out[["data"]] <- out[["data"]] %>% + dplyr::mutate(A = ifelse(chr == "X", state, A)) %>% + dplyr::mutate(B = ifelse(chr == "X", 0, B)) + } + out[["data"]] <- add_states(out[["data"]]) out[["data"]] <- as.data.frame(out[["data"]]) } diff --git a/R/clustering.R b/R/clustering.R index bc9b72e7..8d3bd347 100644 --- a/R/clustering.R +++ b/R/clustering.R @@ -92,7 +92,8 @@ umap_clustering <- function(CNbins, ret_model = TRUE, ret_nn = TRUE, pca = pca, - fast_sgd = fast_sgd) + fast_sgd = fast_sgd, + pca_method = "svdr") } ) diff --git a/R/import_packages.R b/R/import_packages.R index 4db9ceea..c5817462 100644 --- a/R/import_packages.R +++ b/R/import_packages.R @@ -1,2 +1,5 @@ #' @import data.table NULL + +#' @importFrom data.table ":=" +NULL diff --git a/R/plotting.R b/R/plotting.R index 099b5063..40d341c2 100644 --- a/R/plotting.R +++ b/R/plotting.R @@ -432,14 +432,14 @@ plot_umap <- function(clustering, bycol = NULL, alphavalue = 0.5, raster = FALSE ) } g <- ggplot2::ggplot(clustering, ggplot2::aes(x = umap1, y = umap2)) + - ggrastr::geom_point_rast(ggplot2::aes_string(col = bycol), alpha = alphavalue) + + ggrastr::geom_point_rast(ggplot2::aes(col = .data[[bycol]]), alpha = alphavalue) + ggplot2::xlab("UMAP 1") + ggplot2::ylab("UMAP 2") + ggplot2::theme_bw() + ggplot2::guides(colour = ggplot2::guide_legend(ncol = 3)) } else { g <- ggplot2::ggplot(clustering, ggplot2::aes(x = umap1, y = umap2)) + - ggplot2::geom_point(ggplot2::aes_string(col = bycol), alpha = alphavalue) + + ggplot2::geom_point(ggplot2::aes(col = .data[[bycol]]), alpha = alphavalue) + ggplot2::xlab("UMAP 1") + ggplot2::ylab("UMAP 2") + ggplot2::theme_bw() + @@ -713,7 +713,7 @@ plotCNprofile <- function(CNbins, dplyr::mutate(state = factor(paste0(state), levels = c(paste0(seq(0, 10, 1)), "11+"))) %>% ggplot2::ggplot(ggplot2::aes(x = idx, y = copy)) + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - ggrastr::geom_point_rast(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval, shape = shape) + + ggrastr::geom_point_rast(ggplot2::aes(col = .data[[statecol]]), size = pointsize, alpha = alphaval, shape = shape) + ggplot2::scale_color_manual( name = "Copy number", breaks = names(statecolpal), @@ -743,7 +743,7 @@ plotCNprofile <- function(CNbins, dplyr::mutate(state = factor(paste0(state), levels = c(paste0(seq(0, 10, 1)), "11+"))) %>% ggplot2::ggplot(ggplot2::aes(x = idx, y = copy)) + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - ggplot2::geom_point(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval, shape = 16) + + ggplot2::geom_point(ggplot2::aes(col = .data[[statecol]]), size = pointsize, alpha = alphaval, shape = 16) + ggplot2::scale_color_manual( name = "Allele Specific CN", breaks = names(statecolpal), @@ -1405,7 +1405,7 @@ plotCNprofileBAF <- function(cn, dplyr::mutate(state_min = paste0(state_min)) %>% ggplot2::ggplot(ggplot2::aes(x = idx, y = BAF)) + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - ggrastr::geom_point_rast(ggplot2::aes_string(col = BAFcol), size = pointsize, alpha = alphaval, shape = shape) + + ggrastr::geom_point_rast(ggplot2::aes(col = .data[[BAFcol]]), size = pointsize, alpha = alphaval, shape = shape) + ggplot2::scale_color_manual( name = "CN", breaks = names(BAFcolpal), @@ -1440,7 +1440,7 @@ plotCNprofileBAF <- function(cn, dplyr::mutate(state_min = paste0(state_min)) %>% ggplot2::ggplot(ggplot2::aes(x = idx, y = copy)) + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - ggrastr::geom_point_rast(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval, shape = shape) + + ggrastr::geom_point_rast(ggplot2::aes(col = .data[[statecol]]), size = pointsize, alpha = alphaval, shape = shape) + ggplot2::scale_color_manual( name = "Allele Specific CN", breaks = names(statecolpal), @@ -1469,7 +1469,7 @@ plotCNprofileBAF <- function(cn, dplyr::mutate(state_min = paste0(state_min)) %>% ggplot2::ggplot(ggplot2::aes(x = idx, y = BAF)) + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - ggplot2::geom_point(ggplot2::aes_string(col = BAFcol), size = pointsize, alpha = alphaval, shape = shape) + + ggplot2::geom_point(ggplot2::aes(col = .data[[BAFcol]]), size = pointsize, alpha = alphaval, shape = shape) + ggplot2::scale_color_manual( name = "CN", breaks = names(BAFcolpal), @@ -1504,7 +1504,7 @@ plotCNprofileBAF <- function(cn, dplyr::mutate(state_min = paste0(state_min)) %>% ggplot2::ggplot(ggplot2::aes(x = idx, y = copy)) + ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) + - ggplot2::geom_point(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval, shape = shape) + + ggplot2::geom_point(ggplot2::aes(col = .data[[statecol]]), size = pointsize, alpha = alphaval, shape = shape) + ggplot2::scale_color_manual( name = "Allele Specific CN", breaks = names(statecolpal), @@ -1730,7 +1730,7 @@ plotBAFperstate <- function(cn, minpts = 250, minfrac = 0.01, maxstate = 10, den cowplot::theme_cowplot() + ggplot2::geom_crossbar( data = allASstates, ggplot2::aes(y = cBAF, ymin = cBAF, ymax = cBAF), - alpha = 0.2, size = 0.2 + alpha = 0.2, linewidth = 0.2 ) + ggplot2::geom_text(data = text_fraction, ggplot2::aes(x = state_AS_phased, y = y, label = pct)) + ggplot2::xlab("") + @@ -1776,7 +1776,7 @@ plot_density_histogram <- function(dat, mystate, rho, nbins = 30, frac = "NA", l ) g <- ggplot2::ggplot(dat, ggplot2::aes(x = BAF)) + - ggplot2::geom_histogram(ggplot2::aes(y = (..density..)), bins = nbins, fill = "azure4", alpha = 0.8) + + ggplot2::geom_histogram(ggplot2::aes(y = ggplot2::after_stat(density)), bins = nbins, fill = "azure4", alpha = 0.8) + ggplot2::geom_line(data = dffit_bb, stat = "density", ggplot2::aes(BAF, col = type), size = 0.5, adjust = 5) + ggplot2::geom_line(data = dffit_b, stat = "density", ggplot2::aes(BAF, col = type), size = 0.5, adjust = 5, linetype = 2) + ggplot2::scale_colour_manual(values = c(ggplot2::alpha("deepskyblue4", 0.6), ggplot2::alpha("firebrick4", 0.6))) + diff --git a/R/sim-data.R b/R/sim-data.R index c956067b..af24c4de 100644 --- a/R/sim-data.R +++ b/R/sim-data.R @@ -30,7 +30,7 @@ simulate_cell <- function(nchr = 2, } data("dlpbins", envir = environment()) - bins <- dlpbins + bins <- dlpbins %>% dplyr::filter(chr != "Y") bins$state <- base_ploidy bins$state_min <- round(base_ploidy / 2) chrvec <- setdiff(unique(bins$chr), names(clonal_events)) diff --git a/R/util.R b/R/util.R index 5b3ed2af..c25b55db 100644 --- a/R/util.R +++ b/R/util.R @@ -75,7 +75,7 @@ createCNmatrix <- function(CNbins, cnmatrix <- cnmatrix %>% dplyr::as_tibble() %>% - tidyr::fill(., colnames, .direction = "updown") + tidyr::fill(., tidyr::all_of(colnames), .direction = "updown") } cnmatrix <- as.data.frame(cnmatrix) @@ -962,6 +962,7 @@ per_chr_cn <- function(hscn, arms = NULL) { #' @export add_states <- function(df) { df <- df %>% + data.table::as.data.table() %>% .[, state_AS_phased := paste0(A, "|", B)] %>% .[, state_AS := paste0(pmax(state - B, B), "|", pmin(state - B, B))] %>% .[, state_min := pmin(A, B)] %>% diff --git a/man/callHaplotypeSpecificCN.Rd b/man/callHaplotypeSpecificCN.Rd index 1e8ac40c..e98843ac 100644 --- a/man/callHaplotypeSpecificCN.Rd +++ b/man/callHaplotypeSpecificCN.Rd @@ -17,8 +17,8 @@ callHaplotypeSpecificCN( phasebyarm = FALSE, minfrachaplotypes = 0.7, likelihood = "auto", - minbins = 100, - minbinschr = 10, + minbins = 0, + minbinschr = 0, phased_haplotypes = NULL, clustering_method = "copy", maxloherror = 0.035, @@ -31,7 +31,8 @@ callHaplotypeSpecificCN( smoothsingletons = TRUE, fillmissing = TRUE, global_phasing_for_balanced = FALSE, - chrs_for_global_phasing = NULL + chrs_for_global_phasing = NULL, + female = TRUE ) } \arguments{ @@ -85,6 +86,8 @@ callHaplotypeSpecificCN( \item{chrs_for_global_phasing}{Which chromosomes to phase using all cells for diploid regions, default is NULL which uses all chromosomes} +\item{female}{Default is \code{TRUE}, if set to \code{FALSE} and patient is "XY", X chromosome states are set to A|0 where A=Hmmcopy state} + \item{viterbver}{Version of viterbi algorithm to use (cpp or R)} \item{global_phasing_for_diploid}{When using cluster_per_chr, use all cells for phasing diploid regions within the cluster} diff --git a/tests/testthat/test-util.R b/tests/testthat/test-util.R index 9e16c12f..3d0188ed 100644 --- a/tests/testthat/test-util.R +++ b/tests/testthat/test-util.R @@ -39,7 +39,7 @@ test_that("Test widen bins functions", { segs <- create_segments(sim_data_bb$CNbins) test_that("Test create segments", { - expect_equal(dim(segs)[1], 24 * sum(clones_dist)) + expect_equal(dim(segs)[1], 23 * sum(clones_dist)) }) segs_cn <- create_segments(consensuscopynumber(CNbins)) diff --git a/vignettes/ASCN.Rmd b/vignettes/ASCN.Rmd index dc0428c2..11969092 100644 --- a/vignettes/ASCN.Rmd +++ b/vignettes/ASCN.Rmd @@ -97,9 +97,9 @@ There are also some function to plot per cell BAF and state plots. You can speci plotCNprofileBAF(hscn, cellid = "SA921-A90554A-R03-C44") ``` -Here we'll plot an equivalent plot merging data across cell clusters using the utility function `consensyscopynumber`. Here, the `cell_id` column becomes the -```{r, fig.show='hold', fig.height=5 , fig.width=10} +Here we'll plot an equivalent plot merging data across cell clusters using the utility function `consensyscopynumber`. Here, the `cell_id` column becomes the `clone_id`. +```{r, fig.show='hold', fig.height=5 , fig.width=10} consensus_clusters <- consensuscopynumber(hscn$data, cl = cl$clustering) plotCNprofileBAF(consensus_clusters, cellid = "A")