Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Relative Homoplasy Index #158

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: TreeSearch
Title: Phylogenetic Analysis with Discrete Character Data
Version: 1.5.1
Version: 1.5.1.9000
Authors@R: c(
person(
"Martin R.", 'Smith',
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ export(EasyTreesy)
export(EdgeListSearch)
export(EmptyPhyDat)
export(Evaluate)
export(ExpectedLength)
export(Fitch)
export(FitchSteps)
export(GapHandler)
Expand Down Expand Up @@ -202,6 +203,7 @@ importFrom(fastmatch,fmatch)
importFrom(fs,path_sanitize)
importFrom(future,future)
importFrom(graphics,par)
importFrom(memoise,memoise)
importFrom(promises,future_promise)
importFrom(protoclust,protoclust)
importFrom(shiny,runApp)
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# TreeSearch 1.5.1.9000 (2024-05-31, development)

- `Consistency()` also returns the relative homoplasy index of Steell et al.
2023.


# TreeSearch 1.5.1 (2024-05-23)

- Fix calls to `DescendantEdges()`
Expand Down
112 changes: 107 additions & 5 deletions R/Consistency.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,27 +32,36 @@
#' possible values runs from zero (least consistent) to one
#' (perfectly consistent).
#'
#' The **relative homoplasy index** \insertCite{Steell2023}{TreeSearch} is
#' the ratio of the observed excess tree length to the excess tree length
#' due to chance, taken as the median score of a character when the leaves
#' of the given tree are randomly shuffled.
#'
#' The lengths of characters including inapplicable tokens are calculated
#' following \insertCite{Brazeau2019;textual}{TreeSearch}, matching their
#' default treatment in [`TreeLength()`].
#'
#' @template datasetParam
#' @template treeParam
#' @param nRelabel Integer specifying number of leaf relabellings to employ when
#' calculating null tree length. If zero, the RHI will not be calculated.
#'
#' @template compressParam
#'
#' @return `Consistency()` returns a matrix with named columns specifying the
#' consistency index (`ci`),
#' retention index (`ri`), and
#' rescaled consistency index (`rc`).
#' retention index (`ri`),
#' rescaled consistency index (`rc`) and
#' relative homoplasy index (`rhi`).
#'
#' @examples
#' data(inapplicable.datasets)
#' dataset <- inapplicable.phyData[[4]]
#' head(Consistency(dataset, TreeTools::NJTree(dataset)))
#' head(Consistency(dataset, TreeTools::NJTree(dataset), nRelabel = 10))
#' @references \insertAllCited{}
#' @template MRS
#' @export
Consistency <- function (dataset, tree, compress = FALSE) {
Consistency <- function (dataset, tree, nRelabel = 1000, compress = FALSE) {
dsTips <- TipLabels(dataset)
trTips <- TipLabels(tree)
if (!setequal(dsTips, trTips)) {
Expand All @@ -66,6 +75,7 @@ Consistency <- function (dataset, tree, compress = FALSE) {
minLength <- MinimumLength(dataset, compress = TRUE) # Farris's m
maxLength <- MaximumLength(dataset, compress = TRUE) # Farris's g
obsLength <- CharacterLength(tree, dataset, compress = TRUE) # farris's s

extra <- obsLength - minLength # Farris's h
maxHomoplasy <- (maxLength - minLength) # g - m

Expand All @@ -76,7 +86,15 @@ Consistency <- function (dataset, tree, compress = FALSE) {

rc <- ri * minLength / obsLength

ret <- cbind(ci = ci, ri = ri, rc = rc)
if (nRelabel > 0) {
medLength <- ExpectedLength(dataset, tree, nRelabel, compress = TRUE)
expHomoplasy <- medLength - minLength
rhi <- extra / expHomoplasy
} else {
rhi <- NA
}

ret <- cbind(ci = ci, ri = ri, rc = rc, rhi = rhi)

# Return:
if (compress) {
Expand All @@ -85,3 +103,87 @@ Consistency <- function (dataset, tree, compress = FALSE) {
ret[attr(dataset, "index"), ]
}
}


#' Expected length
#'
#' For a given dataset and tree topology, `ExpectedLength()` estimates the
#' length expected if the states of each character are shuffled randomly
#' across the leaves.
#'
#' @references \insertAllCited{}
#' @inheritParams Consistency
#'
#' @return `ExpectedLength()` returns a numeric vector stating the median
#' length of each character in `dataset` on `tree` after `nRelabel` random
#' relabelling of leaves.
#'
#' @importFrom memoise memoise
#' @export
#' @template MRS
ExpectedLength <- function(dataset, tree, nRelabel = 1000, compress = FALSE) {
mat <- do.call(rbind, dataset)
at <- attributes(dataset)
contrast <- at[["contrast"]]

rewritten <- apply(mat, 2, .SortChar,
cont = apply(contrast, 1, .Bin),
inapp = match("-", at[["levels"]], nomatch = 0))

.LengthForChar <- memoise(function(x) {
patterns <- apply(unname(unique(t(
as.data.frame(replicate(nRelabel, sample(rep(seq_along(x), x))))))),
2, I, simplify = FALSE)
nr <- length(patterns[[1]])
phy <- structure(
setNames(patterns, TipLabels(tree)),
"weight" = rep(1, nr),
nr = nr,
nc = dim(contrast)[[2]],
index = seq_len(nr),
levels = at[["levels"]],
allLevels = at[["allLevels"]],
type = "USER",
contrast = contrast,
class = "phyDat")
median(CharacterLength(tree, phy))
})

exp <- apply(apply(rewritten, 2, tabulate, max(rewritten)), 2, .LengthForChar)

# Return:
if (compress) {
exp
} else {
exp[at[["index"]]]
}
}


.Bin <- function(x) {
sum(2 ^ (seq_along(x)[as.logical(x)] - 1))
}

.SortChar <- function(char, cont, inapp) {
logCont <- log2(cont)
nWhole <- 2 ^ floor(max(logCont))
maxN <- nWhole + nWhole - 1
ambig <- logCont != floor(logCont)
swappableTokens <- cont[cont != ifelse(is.na(inapp), 0, inapp) & !ambig]
tab <- tabulate(char, maxN)
mapping <- integer(maxN)
if (!is.na(inapp) && inapp > 0) {
mapping[[inapp]] <- inapp
}
mapping[swappableTokens[order(tab[swappableTokens], decreasing = TRUE)]] <-
sort.int(swappableTokens)

nAssigned <- log2(nWhole) + 1
wholes <- c(mapping[2 ^ (seq_len(nAssigned) - 1)], integer(32 - nAssigned))
mapping[-swappableTokens] <- apply(matrix(as.logical(intToBits(
seq_len(maxN)[-swappableTokens])), 32), 2,
function(x) sum(wholes[x]))

# Return:
mapping[char]
}
9 changes: 9 additions & 0 deletions inst/REFERENCES.bib
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,15 @@ @article{Steel1996
number = {4}
}

@article{Steell2023,
title = {Relative {{Homoplasy Index}}: A New Cross-Comparable Metric for Quantifying Homoplasy in Discrete Character Datasets},
author = {Steell, Elizabeth M. and Hsiang, Allison Y. and Field, Daniel J.},
year = {2023},
doi = {10.1101/2023.10.10.561677},
journal = {BioR&chi;iv}
}


@article{Stockham2002,
title = {Statistically Based Postprocessing of Phylogenetic Analysis by Clustering},
author = {Stockham, C. and Wang, L.-S. and Warnow, T.},
Expand Down
1 change: 1 addition & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ Scripta
Siboglinidae
SmithTern
Squamata
Steell
Syllidae
Syst
TANKE
Expand Down
17 changes: 13 additions & 4 deletions man/Consistency.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

38 changes: 38 additions & 0 deletions man/ExpectedLength.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

31 changes: 31 additions & 0 deletions tests/testthat/test-Consistency.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,34 @@ test_that("Consistency() handles `-`", {
c(ci = m / s, ri = r, rc = r * m / s))
)
})

test_that(".SortChar() works", {
contrast <- structure(c(0, 0, 1, 1, 0, 0, 0,
1, 0, 1, 0, 0, 0, 0,
0, 1, 1, 0, 0, 0, 0,
0, 0, 1, 0, 1, 0, 0,
0, 0, 1, 0, 0, 1, 0,
0, 0, 1, 0, 0, 0, 1), dim = 7:6,
dimnames = list(NULL, c("-", "0", "1", "2", "3", "4")))
cont <- apply(contrast, 1, .Bin)
# Simplest
expect_equal(.SortChar(rep(1:2, 5:4), 1:2, NA), rep(1:2, 5:4))
expect_equal(.SortChar(rep(1:2, 4:5), 1:2, NA), rep(2:1, 4:5))
expect_equal(.SortChar(rep(1:3, 4:6), 1:2, NA), rep(c(2, 1, 3), 4:6))

# Straightforward, no inapp
expect_equal(.SortChar(rep(2 ^ c(1, 2, 4), c(4, 2, 3)), cont, inapp = NA),
rep(2 ^ c(0, 2, 1), c(4, 2, 3)))

# Straightforward
expect_equal(.SortChar(rep(2^c(1, 2, 4), c(4, 2, 3)), cont, inapp = 1),
rep(2 ^ c(1, 3, 2), c(4, 2, 3)))

# Inapplicable
expect_equal(.SortChar(rep(2^c(1, 2, 0, 4), c(4, 2, 3, 3)), cont, inapp = 1),
rep(2^c(1, 3, 0, 2), c(4, 2, 3, 3)))

# Ambiguity: 8->2; 2->4; 1->1
expect_equal(.SortChar(c(8, 8, 8 + 2, 8 + 1, 8, 2 + 1, 2, 2, 1, 1, 1), cont, inapp = 1),
c(2, 2, 2 + 4, 2 + 1, 2, 4 + 1, 4, 4, 1, 1, 1))
})
Loading