From fca03139324ca42f68482d17e5f2983811ea2393 Mon Sep 17 00:00:00 2001 From: Russ Hyde Date: Mon, 19 Dec 2022 18:18:39 +0000 Subject: [PATCH 1/2] refac: function to get mutation-lists from a data.frame defining the clusters --- R/parse_sc0.R | 24 ++++++++++++++++++++++++ R/treeview.R | 12 +++++------- 2 files changed, 29 insertions(+), 7 deletions(-) create mode 100644 R/parse_sc0.R diff --git a/R/parse_sc0.R b/R/parse_sc0.R new file mode 100644 index 0000000..d545059 --- /dev/null +++ b/R/parse_sc0.R @@ -0,0 +1,24 @@ +#' Extract lists of 'defining' and 'all' mutations for each cluster in the data.frame `sc0` +#' +#' @param sc0 data.frame. Must contain columns \code{cluster_id}, \code{defining_mutations} and +#' \code{all_mutations}. The "mutation" columns contain a "|"-separated string of the mutations +#' present in any given cluster. Each row represents a cluster in the phylogeny. +#' +#' @return List of lists. Each entry in the named list corresponds to a cluster in the phylogeny. +#' The inner lists all have entries "defining" and "all", character vectors defining the mutations +#' that are present in the cluster. + +get_mutation_list <- function(sc0) { + required_columns <- c("defining_mutations", "all_mutations", "cluster_id") + stopifnot(all(required_columns %in% colnames(sc0))) + + cmuts <- lapply(seq_len(nrow(sc0)), function(i) { + list( + defining = strsplit(sc0$defining_mutations[i], split = "\\|")[[1]], + all = strsplit(sc0$all_mutations[i], split = "\\|")[[1]] + ) + }) + names(cmuts) <- sc0$cluster_id + + cmuts +} diff --git a/R/treeview.R b/R/treeview.R index 07af5bf..00221a0 100644 --- a/R/treeview.R +++ b/R/treeview.R @@ -47,13 +47,11 @@ treeview <- function(e0, e0 <- readRDS(e0) } sc0 <- e0$Y - cmuts <- lapply(seq_len(nrow(sc0)), function(i) { - list( - defining = strsplit(sc0$defining_mutations[i], split = "\\|")[[1]], - all = strsplit(sc0$all_mutations[i], split = "\\|")[[1]] - ) - }) - names(cmuts) <- sc0$cluster_id + + # 'cmuts' is a list. 
Each element has entries "defining" and "all". There is one entry for each + # node (row) in sc0 + cmuts <- get_mutation_list(sc0) + tr1 <- e0$tre stopifnot(all(branch_cols %in% colnames(e0$Y))) From 0295c11935f77825cbb1bd6da97bb656ec5dd095 Mon Sep 17 00:00:00 2001 From: Russ Hyde Date: Mon, 19 Dec 2022 20:13:02 +0000 Subject: [PATCH 2/2] feat: create cluster-mutation lookup tables --- DESCRIPTION | 43 ++++++++++++++++++++------------------ R/parse_sc0.R | 25 ++++++++++++++++++++++ R/treeview.R | 4 ++++ inst/WORDLIST | 4 ++++ man/get_mutation_list.Rd | 21 +++++++++++++++++++ man/get_mutation_tables.Rd | 21 +++++++++++++++++++ man/save_sina_plot.Rd | 7 ++++--- man/treeview.Rd | 4 ++++ 8 files changed, 106 insertions(+), 23 deletions(-) create mode 100644 man/get_mutation_list.Rd create mode 100644 man/get_mutation_tables.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 5770599..2133303 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,26 +9,29 @@ Description: A pipeline for scanning a SARS-CoV-2 phylogeny for clades with outl Depends: R (>= 4.1.0) Imports: - ape, - dplyr, - foreach, - ggiraph, - ggplot2, - ggtree, - glue, - htmlwidgets, - knitr, - lubridate, - mgcv, - mlesky, - phangorn (>= 2.9.0), - rlang, - Rmpi, - scales, - stats, - stringr, - treedater, - utils + ape, + dplyr, + foreach, + ggiraph, + ggplot2, + ggtree, + glue, + htmlwidgets, + knitr, + lubridate, + mgcv, + mlesky, + phangorn (>= 2.9.0), + purrr, + readr, + rlang, + Rmpi, + scales, + stats, + stringr, + tibble, + treedater, + utils Suggests: doMPI, svglite, diff --git a/R/parse_sc0.R b/R/parse_sc0.R index d545059..8794d73 100644 --- a/R/parse_sc0.R +++ b/R/parse_sc0.R @@ -22,3 +22,28 @@ get_mutation_list <- function(sc0) { cmuts } + +#' Extract data.frames containing the 'defining' and 'all' mutations for each cluster +#' +#' @inheritParams get_mutation_list +#' +#' @return List of two data.frames with names "all" and "defining". 
These contain, respectively, all mutations +#' and just the defining mutations for each cluster in the phylogeny. The data.frames have +#' identical structure with column names \code{cluster_id} and \code{mutation}. + +get_mutation_tables <- function(sc0) { + mutation_list <- get_mutation_list(sc0) + defining_mutations <- purrr::map_df( + mutation_list, ~ tibble::tibble(mutation = .x[["defining"]]), + .id = "cluster_id" + ) + all_mutations <- purrr::map_df( + mutation_list, ~ tibble::tibble(mutation = .x[["all"]]), + .id = "cluster_id" + ) + + list( + defining = defining_mutations, + all = all_mutations + ) +} diff --git a/R/treeview.R b/R/treeview.R index 00221a0..c07237f 100644 --- a/R/treeview.R +++ b/R/treeview.R @@ -52,6 +52,10 @@ treeview <- function(e0, # node (row) in sc0 cmuts <- get_mutation_list(sc0) + cmut_tables <- get_mutation_tables(sc0) + readr::write_csv(cmut_tables[["defining"]], file.path(output_dir, "defining_mutations.csv")) + readr::write_csv(cmut_tables[["all"]], file.path(output_dir, "all_mutations.csv")) + tr1 <- e0$tre stopifnot(all(branch_cols %in% colnames(e0$Y))) diff --git a/inst/WORDLIST b/inst/WORDLIST index fe20d0b..1da8635 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -40,11 +40,14 @@ phangorn phylo phylodynamics polymorphism +purrr +readr Reformats repo rlang Rmpi RoxygenNote +sc scannint sina stringr @@ -56,6 +59,7 @@ SystemRequirements testthat tfpscanner Thr +tibble tis tooltips treedata diff --git a/man/get_mutation_list.Rd b/man/get_mutation_list.Rd new file mode 100644 index 0000000..81e5417 --- /dev/null +++ b/man/get_mutation_list.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parse_sc0.R +\name{get_mutation_list} +\alias{get_mutation_list} +\title{Extract lists of 'defining' and 'all' mutations for each cluster in the data.frame `sc0`} +\usage{ +get_mutation_list(sc0) +} +\arguments{ +\item{sc0}{data.frame. 
Must contain columns \code{cluster_id}, \code{defining_mutations} and +\code{all_mutations}. The "mutation" columns contain a "|"-separated string of the mutations +present in any given cluster. Each row represents a cluster in the phylogeny.} +} +\value{ +List of lists. Each entry in the named list corresponds to a cluster in the phylogeny. + The inner lists all have entries "defining" and "all", character vectors defining the mutations + that are present in the cluster. +} +\description{ +Extract lists of 'defining' and 'all' mutations for each cluster in the data.frame `sc0` +} diff --git a/man/get_mutation_tables.Rd b/man/get_mutation_tables.Rd new file mode 100644 index 0000000..0db5a5a --- /dev/null +++ b/man/get_mutation_tables.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/parse_sc0.R +\name{get_mutation_tables} +\alias{get_mutation_tables} +\title{Extract data.frames containing the 'defining' and 'all' mutations for each cluster} +\usage{ +get_mutation_tables(sc0) +} +\arguments{ +\item{sc0}{data.frame. Must contain columns \code{cluster_id}, \code{defining_mutations} and +\code{all_mutations}. The "mutation" columns contain a "|"-separated string of the mutations +present in any given cluster. Each row represents a cluster in the phylogeny.} +} +\value{ +List of two data.frames with names "all" and "defining". These contain, respectively, all mutations + and just the defining mutations for each cluster in the phylogeny. The data.frames have + identical structure with column names \code{cluster_id} and \code{mutation}. +} +\description{ +Extract data.frames containing the 'defining' and 'all' mutations for each cluster +} diff --git a/man/save_sina_plot.Rd b/man/save_sina_plot.Rd index 92217fd..4eeae30 100644 --- a/man/save_sina_plot.Rd +++ b/man/save_sina_plot.Rd @@ -20,14 +20,15 @@ save_sina_plot( \item{output_dir}{File path. 
The directory where the plot will be stored.} -\item{output_format}{Scalar string (either \code{rds} or \code{html}). In which format should -the plot be saved? Default: \code{rds}.} +\item{output_format}{String (either \code{rds}, \code{html} or both). In which formats should +the plots be saved?} \item{width_svg, height_svg}{The width and height of the plot (only used when \code{output_format == "html"}).} } \value{ -Invisibly returns the file path where the plot was saved +Invisibly returns the file paths (one for each output format) where the plots were + saved. } \description{ Saves as either an htmlwidget (in a \code{html} file) or a ggplot object (in a \code{rds} file). diff --git a/man/treeview.Rd b/man/treeview.Rd index 84bdcc9..987a3e0 100644 --- a/man/treeview.Rd +++ b/man/treeview.Rd @@ -10,6 +10,7 @@ treeview( mutations = c("S:A222V", "S:Y145H", "N:Q9L", "S:E484K"), lineages = c("AY\\\\.9", "AY\\\\.43", "AY\\\\.4\\\\.2"), output_dir = "treeview", + sina_output_format = c("rds", "html"), heatmap_width = 0.075, heatmap_lab_offset = -6 ) @@ -28,6 +29,9 @@ logistic growth rate plot will always be produced.} \item{output_dir}{Outputs will be saved in this directory. Will create the directory if it does not exist.} +\item{sina_output_format}{String (either \code{rds}, \code{html} or both). In which formats +should the sina-cluster plots be saved?} + \item{heatmap_width, heatmap_lab_offset}{Width and label-offset parameters for the constructed heatmap.} }