From a41c54ef080f457abddcee58411707450fd8cdfe Mon Sep 17 00:00:00 2001
From: Marc Williams <marcjwilliams1@gmail.com>
Date: Wed, 2 Oct 2024 11:20:45 +0100
Subject: [PATCH] updates

* Default to no filtering due to low bin numbers, filter cells for phasing but do not remove

* data.table update

* Convert to dt in add_states

* Option for male/femal

* Do not include chrY in sims

* add dplyr tag to filter call

* update vignette

* fix test due to chrY

* fix some warnings from deprecations

---------
---
 NAMESPACE                      |  1 +
 R/callHSCN.R                   | 51 ++++++++++++++++++++++++++--------
 R/clustering.R                 |  3 +-
 R/import_packages.R            |  3 ++
 R/plotting.R                   | 20 ++++++-------
 R/sim-data.R                   |  2 +-
 R/util.R                       |  3 +-
 man/callHaplotypeSpecificCN.Rd |  9 ++++--
 tests/testthat/test-util.R     |  2 +-
 vignettes/ASCN.Rmd             |  4 +--
 10 files changed, 67 insertions(+), 31 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index d32eee9f..0fd66eea 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -98,5 +98,6 @@ export(widen_bins)
 export(widen_haplotypebins)
 import(data.table)
 importFrom(Rcpp,sourceCpp)
+importFrom(data.table,":=")
 importFrom(magrittr,"%>%")
 useDynLib(signals)
diff --git a/R/callHSCN.R b/R/callHSCN.R
index baffbddc..ac79f5c8 100644
--- a/R/callHSCN.R
+++ b/R/callHSCN.R
@@ -634,6 +634,7 @@ filter_haplotypes <- function(haplotypes, fraction){
 #' @param fillmissing For bins with missing counts fill in values based on neighbouring bins
 #' @param global_phasing_for_diploid When using cluster_per_chr, use all cells for phasing diploid regions within the cluster
 #' @param chrs_for_global_phasing Which chromosomes to phase using all cells for diploid regions, default is NULL which uses all chromosomes
+#' @param female Default is `TRUE`, if set to `FALSE` and patient is "XY", X chromosome states are set to A|0 where A=Hmmcopy state
 #' 
 #' @return Haplotype specific copy number object 
 #' 
@@ -677,8 +678,8 @@ callHaplotypeSpecificCN <- function(CNbins,
                                     phasebyarm = FALSE,
                                     minfrachaplotypes = 0.7,
                                     likelihood = "auto",
-                                    minbins = 100,
-                                    minbinschr = 10,
+                                    minbins = 0,
+                                    minbinschr = 0,
                                     phased_haplotypes = NULL,
                                     clustering_method = "copy",
                                     maxloherror = 0.035,
@@ -691,7 +692,8 @@ callHaplotypeSpecificCN <- function(CNbins,
                                     smoothsingletons = TRUE,
                                     fillmissing = TRUE,
                                     global_phasing_for_balanced = FALSE,
-                                    chrs_for_global_phasing = NULL) {
+                                    chrs_for_global_phasing = NULL,
+                                    female = TRUE) {
   if (!clustering_method %in% c("copy", "breakpoints")) {
     stop("Clustering method must be one of copy or breakpoints")
   }
@@ -720,6 +722,17 @@ callHaplotypeSpecificCN <- function(CNbins,
     haplotypes$chr <- sub("chr", "", haplotypes$chr)
   }
   
+  if (female == TRUE){
+    #do not infer states for chr "Y"
+    #haplotypes <- dplyr::filter(haplotypes, chr != "Y")
+    #CNbins <- dplyr::filter(CNbins, chr != "Y")
+  } else{
+    #do not infer states for chr "X" or "Y"
+    #haplotypes <- dplyr::filter(haplotypes, chr != "Y")
+    haplotypes <- dplyr::filter(haplotypes, chr != "X")
+    #CNbins <- dplyr::filter(CNbins, chr != "Y")
+  }
+  
   nhaplotypes <- haplotypes %>% 
     dplyr::group_by(cell_id) %>% 
     dplyr::summarize(n = sum(totalcounts)) %>% 
@@ -794,18 +807,21 @@ callHaplotypeSpecificCN <- function(CNbins,
     cells_to_remove <- ave_dist %>% 
       dplyr::filter(average_distance >= 0.05)
     message(paste0("Removing ", dim(cells_to_remove)[1], " cells for phasing"))
-    ascn <- ascn %>% dplyr::filter(cell_id %in% cells_to_keep$cell_id)
-    haplotypes <- haplotypes %>% dplyr::filter(cell_id %in% cells_to_keep$cell_id)
+    ascn_filt <- ascn %>% dplyr::filter(cell_id %in% cells_to_keep$cell_id)
+    haplotypes_filt <- haplotypes %>% dplyr::filter(cell_id %in% cells_to_keep$cell_id)
+  } else{
+    ascn_filt <- ascn
+    haplotypes_filt <- haplotypes
   }
   
-  infloherror <- ascn %>%
+  infloherror <- ascn_filt %>%
     dplyr::filter(state_phase == "A-Hom") %>%
     dplyr::summarise(err = weighted.mean(x = BAF, w = totalcounts, na.rm = TRUE)) %>%
     dplyr::pull(err)
   infloherror <- min(infloherror, maxloherror) # ensure loh error rate is < maxloherror
   
   if (likelihood == "betabinomial" | likelihood == "auto") {
-    bbfit <- fitBB(ascn)
+    bbfit <- fitBB(ascn_filt)
     if (bbfit$taronesZ > 5) {
       likelihood <- "betabinomial"
       message(paste0("Tarones Z-score: ", round(bbfit$taronesZ, 3), ", using ", likelihood, " model for inference."))
@@ -824,11 +840,11 @@ callHaplotypeSpecificCN <- function(CNbins,
     )
   }
   
-  ascn$balance <- ifelse(ascn$phase == "Balanced", 0, 1)
+  ascn_filt$balance <- ifelse(ascn_filt$phase == "Balanced", 0, 1)
   
   if (is.null(phased_haplotypes)) {
-    chrlist <- proportion_imbalance(ascn,
-                                    haplotypes,
+    chrlist <- proportion_imbalance(ascn_filt,
+                                    haplotypes_filt,
                                     phasebyarm = phasebyarm,
                                     minfrachaplotypes = minfrachaplotypes,
                                     mincells = mincells,
@@ -839,8 +855,8 @@ callHaplotypeSpecificCN <- function(CNbins,
     propdf <- chrlist$propdf
     chrlist <- chrlist$chrlist
     phased_haplotypes <- phase_haplotypes_bychr(
-      ascn = ascn,
-      haplotypes = haplotypes,
+      ascn = ascn_filt,
+      haplotypes = haplotypes_filt,
       chrlist = chrlist,
       phasebyarm = phasebyarm,
       global_phasing_for_balanced = global_phasing_for_balanced,
@@ -920,6 +936,17 @@ callHaplotypeSpecificCN <- function(CNbins,
       tidyr::fill( c("state_min", "A", "B", "state_phase", "state_AS", 
                     "state_AS_phased", "LOH", "state_BAF", "phase"), .direction = "downup") %>% 
       dplyr::ungroup()
+    #add 0|0 states for  hom deletions
+    out[["data"]] <- out[["data"]] %>% 
+      dplyr::mutate(A = ifelse(state == 0, 0, A)) %>% 
+      dplyr::mutate(B = ifelse(state == 0, 0, B))
+    if (female == FALSE){
+      #for male patients set A == state and B == 0
+      out[["data"]] <- out[["data"]] %>% 
+        dplyr::mutate(A = ifelse(chr == "X", state, A)) %>% 
+        dplyr::mutate(B = ifelse(chr == "X", 0, B))
+    }
+    out[["data"]] <- add_states(out[["data"]])
     out[["data"]] <- as.data.frame(out[["data"]])
   }
   
diff --git a/R/clustering.R b/R/clustering.R
index bc9b72e7..8d3bd347 100644
--- a/R/clustering.R
+++ b/R/clustering.R
@@ -92,7 +92,8 @@ umap_clustering <- function(CNbins,
                                 ret_model = TRUE,
                                 ret_nn = TRUE,
                                 pca = pca,
-                                fast_sgd = fast_sgd)
+                                fast_sgd = fast_sgd,
+                                pca_method = "svdr")
     }
   )
   
diff --git a/R/import_packages.R b/R/import_packages.R
index 4db9ceea..c5817462 100644
--- a/R/import_packages.R
+++ b/R/import_packages.R
@@ -1,2 +1,5 @@
 #' @import data.table
 NULL
+
+#' @importFrom data.table ":="
+NULL
diff --git a/R/plotting.R b/R/plotting.R
index 099b5063..40d341c2 100644
--- a/R/plotting.R
+++ b/R/plotting.R
@@ -432,14 +432,14 @@ plot_umap <- function(clustering, bycol = NULL, alphavalue = 0.5, raster = FALSE
       )
     }
     g <- ggplot2::ggplot(clustering, ggplot2::aes(x = umap1, y = umap2)) +
-      ggrastr::geom_point_rast(ggplot2::aes_string(col = bycol), alpha = alphavalue) +
+      ggrastr::geom_point_rast(ggplot2::aes(col = .data[[bycol]]), alpha = alphavalue) +
       ggplot2::xlab("UMAP 1") +
       ggplot2::ylab("UMAP 2") +
       ggplot2::theme_bw() +
       ggplot2::guides(colour = ggplot2::guide_legend(ncol = 3))
   } else {
     g <- ggplot2::ggplot(clustering, ggplot2::aes(x = umap1, y = umap2)) +
-      ggplot2::geom_point(ggplot2::aes_string(col = bycol), alpha = alphavalue) +
+      ggplot2::geom_point(ggplot2::aes(col = .data[[bycol]]), alpha = alphavalue) +
       ggplot2::xlab("UMAP 1") +
       ggplot2::ylab("UMAP 2") +
       ggplot2::theme_bw() +
@@ -713,7 +713,7 @@ plotCNprofile <- function(CNbins,
       dplyr::mutate(state = factor(paste0(state), levels = c(paste0(seq(0, 10, 1)), "11+"))) %>%
       ggplot2::ggplot(ggplot2::aes(x = idx, y = copy)) +
       ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) +
-      ggrastr::geom_point_rast(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval, shape = shape) +
+      ggrastr::geom_point_rast(ggplot2::aes(col = .data[[statecol]]), size = pointsize, alpha = alphaval, shape = shape) +
       ggplot2::scale_color_manual(
         name = "Copy number",
         breaks = names(statecolpal),
@@ -743,7 +743,7 @@ plotCNprofile <- function(CNbins,
       dplyr::mutate(state = factor(paste0(state), levels = c(paste0(seq(0, 10, 1)), "11+"))) %>%
       ggplot2::ggplot(ggplot2::aes(x = idx, y = copy)) +
       ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) +
-      ggplot2::geom_point(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval, shape = 16) +
+      ggplot2::geom_point(ggplot2::aes(col = .data[[statecol]]), size = pointsize, alpha = alphaval, shape = 16) +
       ggplot2::scale_color_manual(
         name = "Allele Specific CN",
         breaks = names(statecolpal),
@@ -1405,7 +1405,7 @@ plotCNprofileBAF <- function(cn,
       dplyr::mutate(state_min = paste0(state_min)) %>%
       ggplot2::ggplot(ggplot2::aes(x = idx, y = BAF)) +
       ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) +
-      ggrastr::geom_point_rast(ggplot2::aes_string(col = BAFcol), size = pointsize, alpha = alphaval, shape = shape) +
+      ggrastr::geom_point_rast(ggplot2::aes(col = .data[[BAFcol]]), size = pointsize, alpha = alphaval, shape = shape) +
       ggplot2::scale_color_manual(
         name = "CN",
         breaks = names(BAFcolpal),
@@ -1440,7 +1440,7 @@ plotCNprofileBAF <- function(cn,
       dplyr::mutate(state_min = paste0(state_min)) %>%
       ggplot2::ggplot(ggplot2::aes(x = idx, y = copy)) +
       ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) +
-      ggrastr::geom_point_rast(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval, shape = shape) +
+      ggrastr::geom_point_rast(ggplot2::aes(col = .data[[statecol]]), size = pointsize, alpha = alphaval, shape = shape) +
       ggplot2::scale_color_manual(
         name = "Allele Specific CN",
         breaks = names(statecolpal),
@@ -1469,7 +1469,7 @@ plotCNprofileBAF <- function(cn,
       dplyr::mutate(state_min = paste0(state_min)) %>%
       ggplot2::ggplot(ggplot2::aes(x = idx, y = BAF)) +
       ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) +
-      ggplot2::geom_point(ggplot2::aes_string(col = BAFcol), size = pointsize, alpha = alphaval, shape = shape) +
+      ggplot2::geom_point(ggplot2::aes(col = .data[[BAFcol]]), size = pointsize, alpha = alphaval, shape = shape) +
       ggplot2::scale_color_manual(
         name = "CN",
         breaks = names(BAFcolpal),
@@ -1504,7 +1504,7 @@ plotCNprofileBAF <- function(cn,
       dplyr::mutate(state_min = paste0(state_min)) %>%
       ggplot2::ggplot(ggplot2::aes(x = idx, y = copy)) +
       ggplot2::geom_vline(xintercept = pl$chrbreaks, col = "grey90", alpha = 0.75) +
-      ggplot2::geom_point(ggplot2::aes_string(col = statecol), size = pointsize, alpha = alphaval, shape = shape) +
+      ggplot2::geom_point(ggplot2::aes(col = .data[[statecol]]), size = pointsize, alpha = alphaval, shape = shape) +
       ggplot2::scale_color_manual(
         name = "Allele Specific CN",
         breaks = names(statecolpal),
@@ -1730,7 +1730,7 @@ plotBAFperstate <- function(cn, minpts = 250, minfrac = 0.01, maxstate = 10, den
     cowplot::theme_cowplot() +
     ggplot2::geom_crossbar(
       data = allASstates, ggplot2::aes(y = cBAF, ymin = cBAF, ymax = cBAF),
-      alpha = 0.2, size = 0.2
+      alpha = 0.2, linewidth = 0.2
     ) +
     ggplot2::geom_text(data = text_fraction, ggplot2::aes(x = state_AS_phased, y = y, label = pct)) +
     ggplot2::xlab("") +
@@ -1776,7 +1776,7 @@ plot_density_histogram <- function(dat, mystate, rho, nbins = 30, frac = "NA", l
   )
 
   g <- ggplot2::ggplot(dat, ggplot2::aes(x = BAF)) +
-    ggplot2::geom_histogram(ggplot2::aes(y = (..density..)), bins = nbins, fill = "azure4", alpha = 0.8) +
+    ggplot2::geom_histogram(ggplot2::aes(y = ggplot2::after_stat(density)), bins = nbins, fill = "azure4", alpha = 0.8) +
     ggplot2::geom_line(data = dffit_bb, stat = "density", ggplot2::aes(BAF, col = type), size = 0.5, adjust = 5) +
     ggplot2::geom_line(data = dffit_b, stat = "density", ggplot2::aes(BAF, col = type), size = 0.5, adjust = 5, linetype = 2) +
     ggplot2::scale_colour_manual(values = c(ggplot2::alpha("deepskyblue4", 0.6), ggplot2::alpha("firebrick4", 0.6))) +
diff --git a/R/sim-data.R b/R/sim-data.R
index c956067b..af24c4de 100644
--- a/R/sim-data.R
+++ b/R/sim-data.R
@@ -30,7 +30,7 @@ simulate_cell <- function(nchr = 2,
   }
 
   data("dlpbins", envir = environment())
-  bins <- dlpbins
+  bins <- dlpbins %>% dplyr::filter(chr != "Y")
   bins$state <- base_ploidy
   bins$state_min <- round(base_ploidy / 2)
   chrvec <- setdiff(unique(bins$chr), names(clonal_events))
diff --git a/R/util.R b/R/util.R
index 5b3ed2af..c25b55db 100644
--- a/R/util.R
+++ b/R/util.R
@@ -75,7 +75,7 @@ createCNmatrix <- function(CNbins,
     
     cnmatrix <- cnmatrix %>%
       dplyr::as_tibble() %>%
-      tidyr::fill(., colnames, .direction = "updown")
+      tidyr::fill(., tidyr::all_of(colnames), .direction = "updown")
   }
 
   cnmatrix <- as.data.frame(cnmatrix)
@@ -962,6 +962,7 @@ per_chr_cn <- function(hscn, arms = NULL) {
 #' @export
 add_states <- function(df) {
   df <- df %>%
+    data.table::as.data.table() %>%
     .[, state_AS_phased := paste0(A, "|", B)] %>%
     .[, state_AS := paste0(pmax(state - B, B), "|", pmin(state - B, B))] %>%
     .[, state_min := pmin(A, B)] %>%
diff --git a/man/callHaplotypeSpecificCN.Rd b/man/callHaplotypeSpecificCN.Rd
index 1e8ac40c..e98843ac 100644
--- a/man/callHaplotypeSpecificCN.Rd
+++ b/man/callHaplotypeSpecificCN.Rd
@@ -17,8 +17,8 @@ callHaplotypeSpecificCN(
   phasebyarm = FALSE,
   minfrachaplotypes = 0.7,
   likelihood = "auto",
-  minbins = 100,
-  minbinschr = 10,
+  minbins = 0,
+  minbinschr = 0,
   phased_haplotypes = NULL,
   clustering_method = "copy",
   maxloherror = 0.035,
@@ -31,7 +31,8 @@ callHaplotypeSpecificCN(
   smoothsingletons = TRUE,
   fillmissing = TRUE,
   global_phasing_for_balanced = FALSE,
-  chrs_for_global_phasing = NULL
+  chrs_for_global_phasing = NULL,
+  female = TRUE
 )
 }
 \arguments{
@@ -85,6 +86,8 @@ callHaplotypeSpecificCN(
 
 \item{chrs_for_global_phasing}{Which chromosomes to phase using all cells for diploid regions, default is NULL which uses all chromosomes}
 
+\item{female}{Default is \code{TRUE}, if set to \code{FALSE} and patient is "XY", X chromosome states are set to A|0 where A=Hmmcopy state}
+
 \item{viterbver}{Version of viterbi algorithm to use (cpp or R)}
 
 \item{global_phasing_for_diploid}{When using cluster_per_chr, use all cells for phasing diploid regions within the cluster}
diff --git a/tests/testthat/test-util.R b/tests/testthat/test-util.R
index 9e16c12f..3d0188ed 100644
--- a/tests/testthat/test-util.R
+++ b/tests/testthat/test-util.R
@@ -39,7 +39,7 @@ test_that("Test widen bins functions", {
 
 segs <- create_segments(sim_data_bb$CNbins)
 test_that("Test create segments", {
-  expect_equal(dim(segs)[1], 24 * sum(clones_dist))
+  expect_equal(dim(segs)[1], 23 * sum(clones_dist))
 })
 
 segs_cn <- create_segments(consensuscopynumber(CNbins))
diff --git a/vignettes/ASCN.Rmd b/vignettes/ASCN.Rmd
index dc0428c2..11969092 100644
--- a/vignettes/ASCN.Rmd
+++ b/vignettes/ASCN.Rmd
@@ -97,9 +97,9 @@ There are also some function to plot per cell BAF and state plots. You can speci
 plotCNprofileBAF(hscn, cellid = "SA921-A90554A-R03-C44")
 ```
 
-Here we'll plot an equivalent plot merging data across cell clusters using the utility function `consensyscopynumber`. Here, the `cell_id` column becomes the 
-```{r, fig.show='hold', fig.height=5 , fig.width=10}
+Here we'll plot an equivalent plot merging data across cell clusters using the utility function `consensyscopynumber`. Here, the `cell_id` column becomes the `clone_id`.
 
+```{r, fig.show='hold', fig.height=5 , fig.width=10}
 consensus_clusters <- consensuscopynumber(hscn$data, cl = cl$clustering)
 
 plotCNprofileBAF(consensus_clusters, cellid = "A")