v0.3.0 release canidate 1

grunwaldlab · Aug 28, 2018 · 5d3251e · 5d3251e
1 parent 8b4778f
commit 5d3251e
Show file tree

Hide file tree

Showing 35 changed files with 357 additions and 862 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: metacoder
 Title: Tools for Parsing, Manipulating, and Graphing Taxonomic Abundance Data
-Version: 0.2.1.9012
+Version: 0.3.0
 Authors@R: c(person("Zachary", "Foster", email =
         "[email protected]", role = c("aut", "cre")),
         person("Niklaus", "Grunwald", email =
@@ -59,7 +59,7 @@ Suggests:
     zlibbioc
 VignetteBuilder: knitr
 RoxygenNote: 6.1.0
-Date: 2018-05-01
+Date: 2018-08-27
 Encoding: UTF-8
 biocViews: 
 LinkingTo: Rcpp
diff --git a/NAMESPACE b/NAMESPACE
@@ -63,4 +63,5 @@ importFrom(dplyr,num_range)
 importFrom(dplyr,one_of)
 importFrom(dplyr,starts_with)
 importFrom(magrittr,"%>%")
+importFrom(rlang,.data)
 useDynLib(metacoder)
diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,6 @@
 # News 
 
-## Current
+##  metacoder 0.3.0
 
 ### Bug fixes
 
@@ -13,7 +13,7 @@
 
 * Added `more_than` option to `calc_n_samples` so that users can set the minimum threshold for whether a sample is counted or not instead of it always 1.
 * Added `calc_prop_samples` function for calculating the proportion of samples with a value greater than 0 (issues [#233](https://github.com/grunwaldlab/metacoder/issues/233).
-* primersearch is faster and takes less memory by using `ape::DNAbin` objects internally.
+* `primersearch` is faster and takes less memory by using `ape::DNAbin` objects internally.
 * Made `calc_taxon_abund` about 5x faster.
 
 ### New features
@@ -24,7 +24,8 @@
 ### Changes
 
 * `primersearch` now takes and returns a `taxmap` object with results added as tables. `primersearch_raw` is a new function that behaves like the old `primersearch` did, returning a table.
-* The `dataset` option of many functions has been renamed to `data` to match the option name in the `taxa` pacakge.
+* The `dataset` option of many functions has been renamed to `data` to match the option name in the `taxa` package.
+* Numerous spelling fixes.
 
 ##  metacoder 0.2.1
 

diff --git a/R/calculations.R b/R/calculations.R
@@ -182,7 +182,7 @@ calc_group_median <- function(obj, data, groups, cols = NULL,
 #'
 #' For a given table in a \code{\link[taxa]{taxmap}} object, apply a function to
 #' rows in groups of columns. The result of the function is used to create new
-#' columns. This is eqivalant to splitting columns of a table by a factor and
+#' columns. This is equivalent to splitting columns of a table by a factor and
 #' using \code{apply} on each group.
 #'
 #' @inheritParams do_calc_on_num_cols
@@ -604,7 +604,7 @@ compare_groups <- function(obj, data, cols, groups,
   # Check groups option
   groups <- check_option_groups(groups, cols)
 
-  # Define defualt function
+  # Define default function
   if (is.null(func)) {
     func <- function(abund_1, abund_2) {
       median_1 <- stats::median(abund_1, na.rm = TRUE)
@@ -971,7 +971,7 @@ calc_prop_samples <- function(obj, data, cols = NULL, groups = "n_samples",
 #'
 #' For a given table in a \code{\link[taxa]{taxmap}} object, apply a function to
 #' rows in groups of columns. The result of the function is used to create new
-#' columns. This is eqivalant to splitting columns of a table by a factor and
+#' columns. This is equivalent to splitting columns of a table by a factor and
 #' using \code{apply} on each group.
 #'
 #' @inheritParams do_calc_on_num_cols

diff --git a/R/heat_tree.R b/R/heat_tree.R
@@ -57,7 +57,7 @@ heat_tree.Taxmap <- function(.input, ...) {
 #' 
 #' Plots the distribution of values associated with a taxonomic classification/heirarchy.
 #' Taxonomic classifications can have multiple roots, resulting in multiple trees on the same plot.
-#' A tree consists of elements, element protperties, conditions, and mapping properties which are
+#' A tree consists of elements, element properties, conditions, and mapping properties which are
 #' represented as parameters in the heat_tree object.
 #' The elements (e.g. nodes, edges, lables, and individual trees) are the infrastructure of the heat tree.
 #' The element properties (e.g. size and color) are characteristics that are manipulated by various 
@@ -139,7 +139,7 @@ heat_tree.Taxmap <- function(.input, ...) {
 #' Default: \code{"area"}.
 #' 
 #' @param node_size_range See details on ranges.
-#' Defualt: Optimize to balance overlaps and range size.
+#' Default: Optimize to balance overlaps and range size.
 #' @param edge_size_range See details on ranges.
 #' Default: relative to node size range. 
 # #' @param tree_size_range See details on ranges.
@@ -220,7 +220,7 @@ heat_tree.Taxmap <- function(.input, ...) {
 #' Default: Do not save plot.
 #' 
 #' @param aspect_ratio The aspect_ratio of the plot.
-#' @param repel_labels If \code{TRUE} (Defualt), use the ggrepel package to spread out labels.
+#' @param repel_labels If \code{TRUE} (Default), use the ggrepel package to spread out labels.
 #' @param repel_force The force of which overlapping labels will be repelled from eachother. 
 #' @param repel_iter The number of iterations used when repelling labels
 #' @param verbose If \code{TRUE} print progress reports as the function runs.
@@ -290,7 +290,7 @@ heat_tree.Taxmap <- function(.input, ...) {
 #' @section ranges:
 #' 
 #' The displayed range of colors and sizes can be explicitly defined or automatically generated.
-#' When explicitely used, the size range will proportionately increase/decrease the size of a particular element.
+#' When explicitly used, the size range will proportionately increase/decrease the size of a particular element.
 #' Size ranges are specified by supplying a \code{numeric} vector with two values: the minimum and maximum.
 #' The units used should be between 0 and 1, representing the proportion of a dimension of the graph.
 #' Since the dimensions of the graph are determined by layout, and not always square, the value
@@ -325,7 +325,7 @@ heat_tree.Taxmap <- function(.input, ...) {
 #' 
 #' This is the minimum and maximum of values displayed on the legend scales.
 #' Intervals are specified by supplying a \code{numeric} vector with two values: the minimum and maximum.
-#' When explicitely used, the <element>_<property>_interval will redefine the way the actual conditional values are being represented
+#' When explicitly used, the <element>_<property>_interval will redefine the way the actual conditional values are being represented
 #' by setting a limit for the <element>_<property>.
 #' Any condition below the minimum <element>_<property>_interval will be graphically represented the same as a condition AT the
 #' minimum value in the full range of conditional values.  Any value above the maximum <element>_<property>_interval will be graphically 

diff --git a/R/heat_tree_matrix.R b/R/heat_tree_matrix.R
@@ -1,6 +1,6 @@
 #' Plot a matrix of heat trees
 #' 
-#' Plot a matrix of heat trees for showing parwise comparisons. A larger,
+#' Plot a matrix of heat trees for showing pairwise comparisons. A larger,
 #' labelled tree serves as a key for the matrix of smaller unlabelled trees. The
 #' data for this function is typically created with \code{\link{compare_groups}},
 #' 

diff --git a/R/option_parsers.R b/R/option_parsers.R
@@ -20,7 +20,7 @@ verify_taxmap <- function(obj) {
 #' Get a data set from a taxmap object
 #' 
 #' NOTE: This will be replaced by the function `get_dataset` in the `taxa`
-#' pacakge. Get a data set from a taxmap object and complain if it does not
+#' package. Get a data set from a taxmap object and complain if it does not
 #' exist. This is intended to be used to parse options in other functions.
 #' 
 #' @param obj A taxmap object

diff --git a/R/parsers.R b/R/parsers.R
@@ -490,7 +490,7 @@ parse_unite_general <- function(input = NULL, file = NULL, include_seqs = TRUE)
 #' @param include_seqs (\code{logical} of length 1) If \code{TRUE}, include 
 #'   sequences in the output object.
 #' @param add_species (\code{logical} of length 1) If \code{TRUE}, add the
-#'   species information to the taxonomy. In this databse, the species name
+#'   species information to the taxonomy. In this database, the species name
 #'   often contains other information as well.
 #'   
 #' @return \code{\link{taxmap}}
@@ -787,3 +787,52 @@ parse_ubiome <- function(file = NULL, table = NULL) {
   return(output)
 }
 
+
+#' Convert a table with an edge list to taxmap
+#'
+#' Converts a table containing an edge list into a [taxa::taxmap()] object.
+#' An "edge list" is two columns in a table, where each row defines a taxon-supertaxon relationship.
+#' The contents of the edge list will be used as taxon IDs.
+#' The whole table will be included as a data set in the output object.
+#'
+#' @param input A table containing an edge list encoded by two columns.
+#' @param taxon_id The name/index of the column containing the taxon IDs.
+#' @param supertaxon_id The name/index of the column containing the taxon IDs for the supertaxon of the IDs in `taxon_col`.
+#'
+#' @family parsers
+#'
+#' @keywords internal
+parse_edge_list <- function(input, taxon_id, supertaxon_id, taxon_name, taxon_rank = NULL) {
+
+  # Create empty taxmap object
+  output <- taxmap()
+
+  # Make taxon ID characters
+  input[taxon_id] <- as.character(input[[taxon_id]])
+  input[supertaxon_id] <- as.character(input[[supertaxon_id]])
+
+  # Add edge list
+  output$edge_list <- data.frame(from = input[[supertaxon_id]],
+                                 to = input[[taxon_id]],
+                                 stringsAsFactors = FALSE)
+
+  # Add taxa
+  output$taxa <- lapply(seq_len(nrow(input)), function(i) {
+    my_name <- input[[taxon_name]][i]
+    if (is.null(taxon_rank)) {
+      my_rank <- NULL
+    } else {
+      my_rank <- input[[taxon_rank]][i]
+    }
+    my_id <- input[[taxon_id]][i]
+    taxon(name = my_name, rank = my_rank, id = my_id)
+  })
+  names(output$taxa) <- input[[taxon_id]]
+
+  # Add data
+  input <- dplyr::mutate(input, taxon_id = taxon_ids(output))
+  input <- dplyr::select(input, taxon_id, everything())
+  output$data <- list(input = input)
+
+  return(output)
+}
diff --git a/R/primersearch.R b/R/primersearch.R
@@ -345,7 +345,7 @@ primersearch_raw <- function(input = NULL, file = NULL, forward, reverse, mismat
 #' 
 #' A pair of primers are aligned against a set of sequences. A
 #' \code{\link[taxa]{taxmap}} object with two tables is returned: a table with
-#' information for each predicited amplicon, quality of match, and predicted
+#' information for each predicted amplicon, quality of match, and predicted
 #' amplicons, and a table with per-taxon amplification statistics. Requires the
 #' EMBOSS tool kit (\url{http://emboss.sourceforge.net/}) to be installed.
 #' 
@@ -389,6 +389,8 @@ primersearch_raw <- function(input = NULL, file = NULL, forward, reverse, mismat
 #'   an external variable (i.e. not in \code{obj$data}), it must be named by
 #'   taxon IDs or have the same length as the number of taxa in \code{obj}.
 #'   Currently, only character vectors are accepted.
+#' @param clone If \code{TRUE}, make a copy of the input object and add on the results (like most R
+#'   functions). If \code{FALSE}, the input will be changed without saving the result, which uses less RAM.
 #' @inheritParams primersearch_raw
 #' 
 #' @return A copy of the input \code{\link[taxa]{taxmap}} object with two tables added. One table contains amplicon information with one row per predicted amplicon with the following info:
@@ -469,11 +471,39 @@ primersearch_raw <- function(input = NULL, file = NULL, forward, reverse, mismat
 #' 
 #' @examples
 #' \dontrun{
+#' # Get example FASTA file
+#' fasta_path <- system.file(file.path("extdata", "silva_subset.fa"),
+#'                           package = "metacoder")
+#' 
+#' # Parse the FASTA file as a taxmap object
+#' obj <- parse_silva_fasta(file = fasta_path)
 #' 
+#' # Simulate PCR with primersearch
+#' # Have to replace Us with Ts in sequences since primersearch
+#' #   does not understand Us.
+#' obj <- primersearch(obj,
+#'                     gsub(silva_seq, pattern = "U", replace = "T"), 
+#'                     forward = c("U519F" = "CAGYMGCCRCGGKAAHACC"),
+#'                     reverse = c("Arch806R" = "GGACTACNSGGGTMTCTAAT"),
+#'                     mismatch = 10)
+#'                            
+#' # Plot what did not ampilify                          
+#' obj %>%
+#'   filter_taxa(prop_amplified < 1) %>%
+#'   heat_tree(node_label = taxon_names, 
+#'             node_color = prop_amplified, 
+#'             node_color_range = c("grey", "red", "purple", "green"),
+#'             node_color_trans = "linear",
+#'             node_color_axis_label = "Proportion amplified",
+#'             node_size = n_obs,
+#'             node_size_axis_label = "Number of sequences",
+#'             layout = "da", 
+#'             initial_layout = "re")
 #' } 
 #' 
+#' @importFrom rlang .data
 #' @export
-primersearch <- function(obj, seqs, forward, reverse, mismatch = 5) {
+primersearch <- function(obj, seqs, forward, reverse, mismatch = 5, clone = TRUE) {
   # Non-standard argument evaluation
   data_used <- eval(substitute(obj$data_used(seqs)))
   sequences <- lazyeval::lazy_eval(lazyeval::lazy(seqs), data = data_used)
@@ -503,49 +533,54 @@ primersearch <- function(obj, seqs, forward, reverse, mismatch = 5) {
   }
 
   # Make copy of input object to construct output
-  output <- obj
+  if (clone) {
+    output <- obj$clone(deep = TRUE)
+  } else {
+    output <- obj
+  }
 
   # Run primer search
   if ("amplicons" %in% names(output$data)) {
     warning(call. = FALSE,
             'The existing dataset "amplicons" will be overwritten.')
   }
   output$data$amplicons <- primersearch_raw(input = sequences, forward = forward,
-                                   reverse = reverse, mismatch = mismatch) %>%
-    dplyr::mutate(taxon_id = names(sequences)[input]) %>%
-    dplyr::rename(seq_index = input) %>%
+                                            reverse = reverse, mismatch = mismatch) %>%
+    dplyr::mutate(taxon_id = names(sequences)[.data$input]) %>%
+    dplyr::rename(seq_index = .data$input) %>%
     dplyr::select(taxon_id , everything()) 
 
   # Make per-taxon table
   if ("tax_amp_stats" %in% names(output$data)) {
     warning(call. = FALSE,
             'The existing dataset "tax_amp_stats" will be overwritten.')
   }
-  output$data$tax_amp_stats <- dplyr::tibble(taxon_id = obj$taxon_ids(),
-                                             query_count = n_obs(output, sequences),
-                                             seq_count = vapply(obs(output, data = "amplicons"),
+  output$data$tax_amp_stats <- dplyr::tibble("taxon_id" = output$taxon_ids(),
+                                             "query_count" = vapply(output$obs(sequences), length, numeric(1)),
+                                             "seq_count" = vapply(output$obs("amplicons"),
                                                                     FUN.VALUE = numeric(1),
                                                                     FUN = function(i) length(unique(output$data$amplicons$seq_index[i]))),
-                                             amp_count = n_obs(output, "amplicons"),
-                                             amplified = amp_count > 0)
+                                             "amp_count" = vapply(output$obs("amplicons"), length, numeric(1)))
+  output$data$tax_amp_stats$amplified <- output$data$tax_amp_stats$amp_count > 0
 
   # Check for multiple amplicons per sequence
   amp_per_seq_data <- output$data$amplicons %>%
-    dplyr::group_by(seq_index) %>%
+    dplyr::group_by(.data$seq_index) %>%
     dplyr::count() %>%
-    dplyr::mutate(taxon_id = names(sequences)[seq_index], multiple = n > 1)
+    dplyr::mutate(taxon_id = names(sequences)[.data$seq_index],
+                  multiple = .data$n > 1)
   output$data$tax_amp_stats$multiple <- unlist(output$obs_apply(amp_per_seq_data, function(i) any(amp_per_seq_data$multiple)))
 
   # Calculate proportion amplified
-  output$mutate_obs("tax_amp_stats", prop_amplified = seq_count / query_count)
+  output$data$tax_amp_stats$prop_amplified <- output$data$tax_amp_stats$seq_count / output$data$tax_amp_stats$query_count
 
   # Calculate amplicon length stats
   output$mutate_obs("tax_amp_stats",
                     med_amp_len = unlist(output$obs_apply("amplicons", value = "amplicon", func = function(s) {
                       if (length(s) == 0) {
                         return(NA_real_)
                       } else {
-                        return(median(nchar(s)))
+                        return(stats::median(nchar(s)))
                       }
                     })),
                     min_amp_len = unlist(output$obs_apply("amplicons", value = "amplicon", func = function(s) {
@@ -566,7 +601,7 @@ primersearch <- function(obj, seqs, forward, reverse, mismatch = 5) {
                       if (length(s) == 0) {
                         return(NA_real_)
                       } else {
-                        return(median(nchar(s)))
+                        return(stats::median(nchar(s)))
                       }
                     })),
                     min_prod_len = unlist(output$obs_apply("amplicons", value = "product", func = function(s) {

diff --git a/R/writers.R b/R/writers.R
@@ -1,4 +1,4 @@
-#' Write an imitation of the Greengenes databse
+#' Write an imitation of the Greengenes database
 #' 
 #' Attempts to save taxonomic and sequence information of a taxmap object in the
 #' Greengenes output format. If the taxmap object was created using
@@ -84,7 +84,7 @@ write_greengenes <- function(obj, tax_file = NULL, seq_file = NULL,
 }
 
 
-#' Write an imitation of the RDP FASTA databse
+#' Write an imitation of the RDP FASTA database
 #' 
 #' Attempts to save taxonomic and sequence information of a taxmap object in the
 #' RDP FASTA format. If the taxmap object was created using
@@ -216,7 +216,7 @@ write_mothur_taxonomy <- function(obj, file,
 }
 
 
-#' Write an imitation of the UNITE general FASTA databse
+#' Write an imitation of the UNITE general FASTA database
 #' 
 #' Attempts to save taxonomic and sequence information of a taxmap object in the
 #' UNITE general FASTA format. If the taxmap object was created using
@@ -285,7 +285,7 @@ write_unite_general <- function(obj, file,
   writeLines(seq_content, file)
 }
 
-#' Write an imitation of the SILVA FASTA databse
+#' Write an imitation of the SILVA FASTA database
 #' 
 #' Attempts to save taxonomic and sequence information of a taxmap object in the
 #' SILVA FASTA format. If the taxmap object was created using