Merge pull request #22 from JCSzamosi/updates

Merge Updates into Main, 2023-11-21
JCSzamosi · Nov 21, 2023 · 69e8748 · 69e8748
2 parents dc9c07b + 2de5f9f
commit 69e8748
Show file tree

Hide file tree

Showing 81 changed files with 2,623 additions and 379 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -4,3 +4,5 @@
 ^NAMESPACE-old$
 ^CHANGELOG\.md$
 ^data-raw$
+^benchmark/*
+^ignore_me.R$
diff --git a/.gitignore b/.gitignore
@@ -7,4 +7,4 @@
 test_data/*
 ring_dir
 private
-*.~lock*
+ignore_me.R
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,45 @@
+* 2023-11-21 v0.1.0
+	* **BREAKING CHANGES**
+		* **Completely re-writes `plot_read_depth()`.** 
+			* allows users to plot read depth with a variable on the X axis and
+			a colour parameter
+			* users can access the old function with `plt_read_depth()`
+			temporarily, but this will be removed before the next full release.
+		* **`rank_abund()` is broken and is no longer exported.** Please file a
+		bug report if you were using this function.
+	* exports `order_taxa()`, by request 
+	* makes `prop_tax_down()` slightly more efficient by checking up front if
+	there is nothing to do.
+	* deprecates `order_levs()` because it isn't used anywhere. Its intended
+	function is performed by `order_taxa()`.
+	* introduces visual and automatic testing of the new `plot_read_depth()`
+	function.
+	* in `plot_tax_bar()` 
+		* the `legloc` argument is now passed directly to
+		`ggplot2::theme(legend.position)` and can take any value that argument accepts.
+		* added a `r_ticks` argument. FALSE by default (default behaviour
+		is unchanged). If TRUE, the tick text on the x-axis is rotated 90
+		degrees and reads down to up.
+		* introduced improved functionality when a custom colour vector is used,
+		with and without names
+		* introduce a `leglen` option to allow the user to limit how many taxa
+		are displayed in the legend without removing any taxa from the plot.
+		* soft-deprecate the `yscale` argument. Will stop supporting non-linear
+		y-axes soon
+		* improve error when the `rank` argument is missing from the input data
+		frame
+		* introduce a warning when the per-sample abundances sum to greater than
+		1 but the `mean` argument is not set to `TRUE`.
+		* prep the function so I can stop exporting the whole `ggplot2`
+		namespace
+	* introduce lifecycle management with the `lifecycle` package
+	* start using roxygen2md to use Markdown in documentation.
+	* introduce the `benchmark` folder which contains "good" plotting outputs
+	against which new versions of the package can be tested. Created a .Rmd
+	file in that folder which tests `plot_tax_bar()`.
+	* remove the files that held the old colour vectors
+
+
 * 2023-04-14 v0.0.1 (was v1.0.1)
 	* The multiple colour vectors have been replaced with a single object,
 	`tax_colours`, which will cycle if there are more than 30 taxa. Having more

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: AfterSl1p
 Type: Package
 Title: Generate Summary Graphics and Basic Analysis of 16s Data
-Version: 0.0.1
+Version: 0.1.0
 Author: J. C. Szamosi and Shahrokh Shekarriz
 Authors@R: c(person("JC", "Szamosi", email = "[email protected]",
                   role = c('aut','cre')), 
@@ -19,6 +19,7 @@ Depends:
 Imports:
     dplyr (>= 0.7.2),
     ggplot2 (>= 2.2.1),
+    lifecycle,
     phyloseq (>= 1.19.1),
     pipeR,
     rlang (>= 0.1.2),
@@ -28,3 +29,4 @@ RoxygenNote: 7.2.3
 Suggests: 
     testthat (>= 3.0.0)
 Config/testthat/edition: 3
+Roxygen: list(markdown = TRUE)
diff --git a/NAMESPACE b/NAMESPACE
@@ -4,12 +4,15 @@ export(long_distance_df)
 export(make_ord_df)
 export(make_phy_df)
 export(order_levs)
+export(order_taxa)
 export(plot_read_depth)
 export(plot_tax_bar)
 export(plt_ord)
+export(plt_read_depth)
 export(prop_tax_down)
-export(rank_abund)
 export(rotate_ticks)
+export(tax_colours)
 import(ggplot2)
 import(pipeR)
 import(rlang)
+importFrom(lifecycle,deprecated)
diff --git a/R/AfterSl1p-package.R b/R/AfterSl1p-package.R
@@ -2,8 +2,9 @@
 "_PACKAGE"
 
 ## usethis namespace: start
-#' @import rlang
-#' @import pipeR
 #' @import ggplot2
+#' @import pipeR
+#' @import rlang
+#' @importFrom lifecycle deprecated
 ## usethis namespace: end
 NULL
diff --git a/R/data.R b/R/data.R
@@ -2,13 +2,15 @@
 #'
 #' A vector of colours for use with taxa bar charts.
 #'
-#' @section Details: There are 30, and grey will be added by the
-#'   \code{plot_tax_bar} function for the "Other" category. If a plot calls for
-#'   more than 30 colours, this will just recycle. That is usually fine because
-#'   low-abundance stuff can't be seen anyway, but if you have a situation where
-#'   you have more than 30 things that actually need to be distinguished, you'll
-#'   need to provide your own vector. Also, if you are in that situation, try to
-#'   stop.
+#' @section Details: There are 30 colours, and grey will be added by the
+#'   [plot_tax_bar()] function for the "Other" category. If a plot
+#'   calls for more than 30 colours, this will just recycle. That is usually
+#'   fine because low-abundance stuff can't be seen anyway, but if you have a
+#'   situation where you have more than 30 things that actually need to be
+#'   distinguished, you'll need to provide your own vector. Also, if you are in
+#'   that situation, try to find another way to do what you are doing. People
+#'   cannot generally distinguish anywhere near 30 colours on a single plot.
+#' @export
 tax_colours = c("#87c5ab","#eea27c","#a9a8d2","#ffff99","#9999ff","#fb8072",
                 "#80b1d3","#fdb462","#b3de69","#fccde5","#bc80bd","#ccebc5",
                 "#ffed6f","#a6cee3","#fb9a99","#fdbf6f","#d38d99","#b3b3ff",

diff --git a/R/long_distance_df.R b/R/long_distance_df.R
@@ -3,13 +3,13 @@
 #' Create a long data frame of among-sample distances
 #'
 #'
-#' \code{long_distance_df} creates a long data frame of all the pairwise
+#' `long_distance_df` creates a long data frame of all the pairwise
 #' distances from a sample distance matrix (e.g. the output of
-#' \code{\link{phyloseq::distance}}) with all the metadata listed for each sample.
-#' Allows for easy within- and among-group boxplots, or whatever other
-#' comparisons are of interest.
+#' [phyloseq::distance()]) with all the metadata listed for
+#' each sample. Allows for easy within- and among-group boxplots, or whatever
+#' other comparisons are of interest.
 #'
-#' @section Value: A data frame \eqn{N(N-1)} (or \eqn{N^2} if \code{diag = TRUE}
+#' @section Value: A data frame \eqn{N(N-1)} (or \eqn{N^2} if `diag = TRUE`
 #'   is set) rows (where N is the number of samples) with sample IDs, metadata,
 #'   and pairwise distances listed for each pair of samples. Sample ID and
 #'   metadata columns have '1' or '2' appended to them so the user can tell
@@ -18,22 +18,22 @@
 #'   names as row and column names.
 #' @param metadat A data frame or data frame-like object with the data set's
 #'   metadata
-#' @param idcol (\code{'X.SampleID'}.) A string. The column in \code{metadat}
+#' @param idcol (`'X.SampleID'`.) A string. The column in `metadat`
 #'   that holds the sample names. Sample names should match the row/column names
 #'   of the distance matrix. If there are samples in the metadata data frame
 #'   that are missing from the distance matrix, they will be excluded with a
 #'   warning. If there are samples in the distance matrix that are missing from
 #'   the metadata, you will get an error.
-#' @param diag (\code{FALSE}.) Logical. Whether the diagonal elements (zeros in
+#' @param diag (`FALSE`.) Logical. Whether the diagonal elements (zeros in
 #'   a distance matrix) should be included in the long data frame. Defaults to
-#'   \code{FALSE} because we almost never want them.
-#' @param suff (\code{c('1','2')}.) A character vector of length 2. The suffixes
+#'   `FALSE` because we almost never want them.
+#' @param suff (`c('1','2')`.) A character vector of length 2. The suffixes
 #'   to be appended to the metadata column names in the output. The two elements
 #'   must not be identical.
-#' @param distcol (\code{'Distance'}.) A string. The desired column name for the
+#' @param distcol (`'Distance'`.) A string. The desired column name for the
 #'   distance column in your long data frame. Only here to avoid clashes with
 #'   existing metadata column names.
-#' @param baseline (\code{'NULL'}). A dataframe whose column names must also be
+#' @param baseline (`'NULL'`). A dataframe whose column names must also be
 #'   column names in the metadat data frame, and whose rows contain a subset of
 #'   the possible values/combinations. If this parameter is used, all the
 #'   samples whose metadata matches a row in this data frame will end up in
@@ -79,13 +79,9 @@ long_distance_df = function(dmat, metadat, idcol = 'X.SampleID', diag = FALSE,
     return(lddf)
 }
 
-
-
-## Functions to generate distance bar charts -----------------------------------
-
 ### lddf_check -----------------------------------------------------------------
 
-#' Check the inputs of \code{long_distance_df()}
+#' Check the inputs of `long_distance_df()`
 #'
 #' For internal use only
 lddf_check = function(dmat, metadat, idcol = 'X.SampleID', diag = FALSE,
@@ -131,26 +127,28 @@ lddf_check = function(dmat, metadat, idcol = 'X.SampleID', diag = FALSE,
 
 #' Does the actual gathering and spreading without testing assumptions
 #'
-#' \code{lddf_work} Does the actual gathering, spreading, and joining associated
-#' with making the lddf, but without checking if the distance matrix is sensible
-#' or removing diagonals and repeats. This is for when you know what you're
-#' doing and have trimmed your distance matrix down to only what you know you
-#' need. Good for permutation tests.
+#' `lddf_work` is used internally by `long_distance_df()`. I recommend
+#' you use that function unless you really know what you're doing. This function
+#' does the actual gathering, spreading, and joining associated with making the
+#' lddf, but without checking if the distance matrix is sensible or removing
+#' diagonals and repeats. Use this function if you know exactly what you want
+#' and have trimmed your distance matrix down to only what you know you need.
+#' Good for permutation tests.
 #'
 #' @param dmat A distance matrix or other diagonal matrix object with sample
 #'   names as row and column names.
 #' @param metadat A data frame or data frame-like object with the data set's
 #'   metadata
-#' @param idcol (\code{'X.SampleID'}.) A string. The column in \code{metadat}
+#' @param idcol (`'X.SampleID'`.) A string. The column in `metadat`
 #'   that holds the sample names. Sample names should match the row/column names
 #'   of the distance matrix. If there are samples in the metadata data frame
 #'   that are missing from the distance matrix, they will be excluded with a
 #'   warning. If there are samples in the distance matrix that are missing from
 #'   the metadata, you will get an error.
-#' @param suff (\code{c('1','2')}.) A character vector of length 2. The suffixes
+#' @param suff (`c('1','2')`.) A character vector of length 2. The suffixes
 #'   to be appended to the metadata column names in the output. The two elements
 #'   must not be identical.
-#' @param distcol (\code{'Distance'}.) A string. The desired column name for the
+#' @param distcol (`'Distance'`.) A string. The desired column name for the
 #'   distance column in your long data frame. Only here to avoid clashes with
 #'   existing metadata column names.
 lddf_work = function(dmat, metadat, idcol = 'X.SampleID', suff = c('1','2'),

diff --git a/R/make_phy_df.R b/R/make_phy_df.R
@@ -2,7 +2,7 @@
 
 #' Generate a Data Frame for Taxon Bar Charts
 #'
-#' \code{make_phy_df} generates a data frame that is useful for generating taxon
+#' `make_phy_df` generates a data frame that is useful for generating taxon
 #' bar charts.
 #'
 #' @section Details: This function takes a phyloseq object and generates a data
@@ -13,23 +13,23 @@
 #'   abundance, and weird things will happen if it is not.
 #'
 #' @section Value: A data frame similar in structure to that generated by
-#'   \code{psmelt}, but with an 'Other' category added and taxon levels ordered
-#'   for use in plotting.
+#'   [phyloseq::psmelt()], but with an 'Other' category added and
+#'   taxon levels ordered for use in plotting.
 #'
 #' @param physeq A phyloseq object.
 #' @param rank The rank at which to glom taxa. Must be one of 'OTU', 'Genus',
 #'   'Family', 'Order', 'Class', 'Phylum'. Default is 'Genus'.
 #' @param cutoff The abundance cutoff below which taxa are grouped into 'Other'.
 #'   If you don't want anything grouped into 'Other', set this to 0. Default is
 #'   0.001.
-#' @param indic a flag to indicate if the taxon names have level indicators.
-#'   If FALSE, they are added.
+#' @param indic a flag to indicate if the taxon names have level indicators. If
+#'   FALSE, they are added.
 #' @param prop Specifies whether taxa need to be propagated down the taxonomy
-#'   table (default, TRUE) or if this has already been done.
-#' @param count If FALSE (default) the function will expect a relative abundance
-#'   table and create an 'Other' category for taxa below the cutoff (and will
-#'   raise an error if the table is not relative abundance). If TRUE, the
-#'   function will not check for relative abundance and will not create an
+#'   table (default is `TRUE`) or if this has already been done.
+#' @param count If `FALSE` (default) the function will expect a relative
+#'   abundance table and create an 'Other' category for taxa below the cutoff
+#'   (and will raise an error if the table is not relative abundance). If TRUE,
+#'   the function will not check for relative abundance and will not create an
 #'   'Other' category.
 #' @export
 make_phy_df = function(physeq, rank = 'Genus', cutoff = 0.001, indic = FALSE,
@@ -41,7 +41,7 @@ make_phy_df = function(physeq, rank = 'Genus', cutoff = 0.001, indic = FALSE,
         stop('physeq must be a relative abundance table. You have counts > 1.')
     }
     ranks = colnames(phyloseq::tax_table(physeq))
-    if (rank == 'OTU'){
+    if ((rank == 'OTU') & !('OTU' %in% ranks)) {
         ranks = c(ranks, rank)
     }
 
@@ -131,27 +131,28 @@ remain = function(x, tot = 1){
 
 #' Order Taxon Name Factors
 #'
-#' \code{order_taxa} reorders the taxon names in a taxon column (e.g. 'Class' or
+#' `order_taxa()` reorders the taxon names in a taxon column (e.g. 'Class' or
 #' 'Phylum') by the taxon's mean abundance (but always makes sure to put Other
 #' first).
 #'
 #' @section Value: A data frame that is identical to the one given, but with the
 #'   specified column re-ordered by its mean abundance
 #'
 #' @param phy_df A data frame of a phyloseq object, as produced by
-#'   \code{\link{psmelt}} or \code{\link{make_phy_df}}.
+#'   [phyloseq::psmelt()] or [make_phy_df()].
 #' @param rank The name of the column to be re-ordered
 #' @param abund The name of the abundances column. Defaults to 'Abundance'
 #' @param decreasing Specifies whether the taxon order should be based on
 #'   decreasing or increasing abundance. Defaults to FALSE.
+#' @export
 order_taxa = function(phy_df, rank, abund = 'Abundance', decreasing = FALSE){
 
     phy_df[,rank] = factor(phy_df[,rank])
-	phy_df %>%
-        dplyr::filter(UQ(sym(rank)) != 'Other') %>%
-		dplyr::group_by(UQ(sym(rank))) %>%
-		dplyr::summarize(Tot = sum(UQ(sym(abund)))) %>%
-		data.frame() -> total_abunds
+    total_abunds = (phy_df
+                    %>% dplyr::filter(.data[[rank]] != 'Other')
+                    %>% dplyr::group_by(.data[[rank]])
+                    %>%	dplyr::summarize(Tot = sum(.data[[abund]]))
+                    %>%	data.frame())
 
 	lev_ord = levels(droplevels(total_abunds[,rank]))
 	if (decreasing){