diff --git a/DESCRIPTION b/DESCRIPTION index 7877fd1dc..3c817547c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TreeSearch Title: Phylogenetic Analysis with Discrete Character Data -Version: 1.5.1 +Version: 1.5.1.9000 Authors@R: c( person( "Martin R.", 'Smith', diff --git a/NAMESPACE b/NAMESPACE index 98ce38262..d896083c9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,6 +33,7 @@ export(EasyTreesy) export(EdgeListSearch) export(EmptyPhyDat) export(Evaluate) +export(ExpectedLength) export(Fitch) export(FitchSteps) export(GapHandler) @@ -202,6 +203,7 @@ importFrom(fastmatch,fmatch) importFrom(fs,path_sanitize) importFrom(future,future) importFrom(graphics,par) +importFrom(memoise,memoise) importFrom(promises,future_promise) importFrom(protoclust,protoclust) importFrom(shiny,runApp) diff --git a/NEWS.md b/NEWS.md index 01a62acf5..b51712ee4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# TreeSearch 1.5.1.9000 (2024-05-31, development) + +- `Consistency()` also returns the relative homoplasy index of Steell et al. + 2023. + + # TreeSearch 1.5.1 (2024-05-23) - Fix calls to `DescendantEdges()` diff --git a/R/Consistency.R b/R/Consistency.R index cd8c3813f..866271c0c 100644 --- a/R/Consistency.R +++ b/R/Consistency.R @@ -32,27 +32,36 @@ #' possible values runs from zero (least consistent) to one #' (perfectly consistent). #' +#' The **relative homoplasy index** \insertCite{Steell2023}{TreeSearch} is +#' the ratio of the observed excess tree length to the excess tree length +#' due to chance, taken as the median score of a character when the leaves +#' of the given tree are randomly shuffled. +#' #' The lengths of characters including inapplicable tokens are calculated #' following \insertCite{Brazeau2019;textual}{TreeSearch}, matching their #' default treatment in [`TreeLength()`]. #' #' @template datasetParam #' @template treeParam +#' @param nRelabel Integer specifying number of leaf relabellings to employ when +#' calculating null tree length. If zero, the RHI will not be calculated. +#' #' @template compressParam #' #' @return `Consistency()` returns a matrix with named columns specifying the #' consistency index (`ci`), -#' retention index (`ri`), and -#' rescaled consistency index (`rc`). +#' retention index (`ri`), +#' rescaled consistency index (`rc`) and +#' relative homoplasy index (`rhi`). #' #' @examples #' data(inapplicable.datasets) #' dataset <- inapplicable.phyData[[4]] -#' head(Consistency(dataset, TreeTools::NJTree(dataset))) +#' head(Consistency(dataset, TreeTools::NJTree(dataset), nRelabel = 10)) #' @references \insertAllCited{} #' @template MRS #' @export -Consistency <- function (dataset, tree, compress = FALSE) { +Consistency <- function (dataset, tree, nRelabel = 1000, compress = FALSE) { dsTips <- TipLabels(dataset) trTips <- TipLabels(tree) if (!setequal(dsTips, trTips)) { @@ -66,6 +75,7 @@ Consistency <- function (dataset, tree, compress = FALSE) { minLength <- MinimumLength(dataset, compress = TRUE) # Farris's m maxLength <- MaximumLength(dataset, compress = TRUE) # Farris's g obsLength <- CharacterLength(tree, dataset, compress = TRUE) # farris's s + extra <- obsLength - minLength # Farris's h maxHomoplasy <- (maxLength - minLength) # g - m @@ -76,7 +86,15 @@ Consistency <- function (dataset, tree, compress = FALSE) { rc <- ri * minLength / obsLength - ret <- cbind(ci = ci, ri = ri, rc = rc) + if (nRelabel > 0) { + medLength <- ExpectedLength(dataset, tree, nRelabel, compress = TRUE) + expHomoplasy <- medLength - minLength + rhi <- extra / expHomoplasy + } else { + rhi <- NA + } + + ret <- cbind(ci = ci, ri = ri, rc = rc, rhi = rhi) # Return: if (compress) { @@ -85,3 +103,87 @@ Consistency <- function (dataset, tree, compress = FALSE) { ret[attr(dataset, "index"), ] } } + + +#' Expected length +#' +#' For a given dataset and tree topology, `ExpectedLength()` estimates the +#' length expected if the states of each character are shuffled randomly +#' across the leaves. +#' +#' @references \insertAllCited{} +#' @inheritParams Consistency +#' +#' @return `ExpectedLength()` returns a numeric vector stating the median +#' length of each character in `dataset` on `tree` after `nRelabel` random +#' relabelling of leaves. +#' +#' @importFrom memoise memoise +#' @export +#' @template MRS +ExpectedLength <- function(dataset, tree, nRelabel = 1000, compress = FALSE) { + mat <- do.call(rbind, dataset) + at <- attributes(dataset) + contrast <- at[["contrast"]] + + rewritten <- apply(mat, 2, .SortChar, + cont = apply(contrast, 1, .Bin), + inapp = match("-", at[["levels"]], nomatch = 0)) + + .LengthForChar <- memoise(function(x) { + patterns <- apply(unname(unique(t( + as.data.frame(replicate(nRelabel, sample(rep(seq_along(x), x))))))), + 2, I, simplify = FALSE) + nr <- length(patterns[[1]]) + phy <- structure( + setNames(patterns, TipLabels(tree)), + "weight" = rep(1, nr), + nr = nr, + nc = dim(contrast)[[2]], + index = seq_len(nr), + levels = at[["levels"]], + allLevels = at[["allLevels"]], + type = "USER", + contrast = contrast, + class = "phyDat") + median(CharacterLength(tree, phy)) + }) + + exp <- apply(apply(rewritten, 2, tabulate, max(rewritten)), 2, .LengthForChar) + + # Return: + if (compress) { + exp + } else { + exp[at[["index"]]] + } +} + + +.Bin <- function(x) { + sum(2 ^ (seq_along(x)[as.logical(x)] - 1)) +} + +.SortChar <- function(char, cont, inapp) { + logCont <- log2(cont) + nWhole <- 2 ^ floor(max(logCont)) + maxN <- nWhole + nWhole - 1 + ambig <- logCont != floor(logCont) + swappableTokens <- cont[cont != ifelse(is.na(inapp), 0, inapp) & !ambig] + tab <- tabulate(char, maxN) + mapping <- integer(maxN) + if (!is.na(inapp) && inapp > 0) { + mapping[[inapp]] <- inapp + } + mapping[swappableTokens[order(tab[swappableTokens], decreasing = TRUE)]] <- + sort.int(swappableTokens) + + nAssigned <- log2(nWhole) + 1 + wholes <- c(mapping[2 ^ (seq_len(nAssigned) - 1)], integer(32 - nAssigned)) + mapping[-swappableTokens] <- apply(matrix(as.logical(intToBits( + seq_len(maxN)[-swappableTokens])), 32), 2, + function(x) sum(wholes[x])) + + # Return: + mapping[char] +} diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index 130091ffc..69947585c 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -531,6 +531,15 @@ @article{Steel1996 number = {4} } +@article{Steell2023, + title = {Relative {{Homoplasy Index}}: A New Cross-Comparable Metric for Quantifying Homoplasy in Discrete Character Datasets}, + author = {Steell, Elizabeth M. and Hsiang, Allison Y. and Field, Daniel J.}, + year = {2023}, + doi = {10.1101/2023.10.10.561677}, + journal = {BioRχiv} +} + + @article{Stockham2002, title = {Statistically Based Postprocessing of Phylogenetic Analysis by Clustering}, author = {Stockham, C. and Wang, L.-S. and Warnow, T.}, diff --git a/inst/WORDLIST b/inst/WORDLIST index d9966f613..698107dd4 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -142,6 +142,7 @@ Scripta Siboglinidae SmithTern Squamata +Steell Syllidae Syst TANKE diff --git a/man/Consistency.Rd b/man/Consistency.Rd index 3bd55a753..b290d4791 100644 --- a/man/Consistency.Rd +++ b/man/Consistency.Rd @@ -4,7 +4,7 @@ \alias{Consistency} \title{Consistency / retention "indices"} \usage{ -Consistency(dataset, tree, compress = FALSE) +Consistency(dataset, tree, nRelabel = 1000, compress = FALSE) } \arguments{ \item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class @@ -12,6 +12,9 @@ Consistency(dataset, tree, compress = FALSE) \item{tree}{A tree of class \code{\link{phylo}}.} +\item{nRelabel}{Integer specifying number of leaf relabellings to employ when +calculating null tree length. If zero, the RHI will not be calculated.} + \item{compress}{Logical specifying whether to retain the compression of a \code{phyDat} object or to return a vector specifying to each individual character, decompressed using the dataset's \code{index} attribute.} @@ -19,8 +22,9 @@ character, decompressed using the dataset's \code{index} attribute.} \value{ \code{Consistency()} returns a matrix with named columns specifying the consistency index (\code{ci}), -retention index (\code{ri}), and -rescaled consistency index (\code{rc}). +retention index (\code{ri}), +rescaled consistency index (\code{rc}) and +relative homoplasy index (\code{rhi}). } \description{ \code{Consistency()} calculates the consistency "index" and retention index @@ -56,6 +60,11 @@ retention indices; it rescales the consistency index such that its range of possible values runs from zero (least consistent) to one (perfectly consistent). +The \strong{relative homoplasy index} \insertCite{Steell2023}{TreeSearch} is +the ratio of the observed excess tree length to the excess tree length +due to chance, taken as the median score of a character when the leaves +of the given tree are randomly shuffled. + The lengths of characters including inapplicable tokens are calculated following \insertCite{Brazeau2019;textual}{TreeSearch}, matching their default treatment in \code{\link[=TreeLength]{TreeLength()}}. @@ -63,7 +72,7 @@ default treatment in \code{\link[=TreeLength]{TreeLength()}}. \examples{ data(inapplicable.datasets) dataset <- inapplicable.phyData[[4]] -head(Consistency(dataset, TreeTools::NJTree(dataset))) +head(Consistency(dataset, TreeTools::NJTree(dataset), nRelabel = 10)) } \references{ \insertAllCited{} diff --git a/man/ExpectedLength.Rd b/man/ExpectedLength.Rd new file mode 100644 index 000000000..16cf79f09 --- /dev/null +++ b/man/ExpectedLength.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Consistency.R +\name{ExpectedLength} +\alias{ExpectedLength} +\title{Expected length} +\usage{ +ExpectedLength(dataset, tree, nRelabel = 1000, compress = FALSE) +} +\arguments{ +\item{dataset}{A phylogenetic data matrix of \pkg{phangorn} class +\code{phyDat}, whose names correspond to the labels of any accompanying tree.} + +\item{tree}{A tree of class \code{\link{phylo}}.} + +\item{nRelabel}{Integer specifying number of leaf relabellings to employ when +calculating null tree length. If zero, the RHI will not be calculated.} + +\item{compress}{Logical specifying whether to retain the compression of a +\code{phyDat} object or to return a vector specifying to each individual +character, decompressed using the dataset's \code{index} attribute.} +} +\value{ +\code{ExpectedLength()} returns a numeric vector stating the median +length of each character in \code{dataset} on \code{tree} after \code{nRelabel} random +relabelling of leaves. +} +\description{ +For a given dataset and tree topology, \code{ExpectedLength()} estimates the +length expected if the states of each character are shuffled randomly +across the leaves. +} +\references{ +\insertAllCited{} +} +\author{ +\href{https://smithlabdurham.github.io/}{Martin R. Smith} +(\href{mailto:martin.smith@durham.ac.uk}{martin.smith@durham.ac.uk}) +} diff --git a/tests/testthat/test-Consistency.R b/tests/testthat/test-Consistency.R index 885c7811c..25a1c884a 100644 --- a/tests/testthat/test-Consistency.R +++ b/tests/testthat/test-Consistency.R @@ -56,3 +56,34 @@ test_that("Consistency() handles `-`", { c(ci = m / s, ri = r, rc = r * m / s)) ) }) + +test_that(".SortChar() works", { + contrast <- structure(c(0, 0, 1, 1, 0, 0, 0, + 1, 0, 1, 0, 0, 0, 0, + 0, 1, 1, 0, 0, 0, 0, + 0, 0, 1, 0, 1, 0, 0, + 0, 0, 1, 0, 0, 1, 0, + 0, 0, 1, 0, 0, 0, 1), dim = 7:6, + dimnames = list(NULL, c("-", "0", "1", "2", "3", "4"))) + cont <- apply(contrast, 1, .Bin) + # Simplest + expect_equal(.SortChar(rep(1:2, 5:4), 1:2, NA), rep(1:2, 5:4)) + expect_equal(.SortChar(rep(1:2, 4:5), 1:2, NA), rep(2:1, 4:5)) + expect_equal(.SortChar(rep(1:3, 4:6), 1:2, NA), rep(c(2, 1, 3), 4:6)) + + # Straightforward, no inapp + expect_equal(.SortChar(rep(2 ^ c(1, 2, 4), c(4, 2, 3)), cont, inapp = NA), + rep(2 ^ c(0, 2, 1), c(4, 2, 3))) + + # Straightforward + expect_equal(.SortChar(rep(2^c(1, 2, 4), c(4, 2, 3)), cont, inapp = 1), + rep(2 ^ c(1, 3, 2), c(4, 2, 3))) + + # Inapplicable + expect_equal(.SortChar(rep(2^c(1, 2, 0, 4), c(4, 2, 3, 3)), cont, inapp = 1), + rep(2^c(1, 3, 0, 2), c(4, 2, 3, 3))) + + # Ambiguity: 8->2; 2->4; 1->1 + expect_equal(.SortChar(c(8, 8, 8 + 2, 8 + 1, 8, 2 + 1, 2, 2, 1, 1, 1), cont, inapp = 1), + c(2, 2, 2 + 4, 2 + 1, 2, 4 + 1, 4, 4, 1, 1, 1)) +})