Skip to content

Commit

Permalink
add beast function tree and data
Browse files Browse the repository at this point in the history
  • Loading branch information
Dhihram committed Nov 27, 2024
1 parent 7d2ee12 commit 9566d9d
Show file tree
Hide file tree
Showing 9 changed files with 408 additions and 4 deletions.
11 changes: 7 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Version: 0.1.3
Author: Dhihram Tenrisau, Stéphane Hué
Maintainer: Dhihram Tenrisau <[email protected]>
Description: This package is used to find the cluster-based tree and metadata. The cluster-based tree is built based on the clusters in the data. The metadata contains information about the data used to build the tree.
License: file LICENSE
License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
Expand All @@ -14,12 +14,15 @@ Imports:
ape,
dplyr,
stringr,
progress
progress,
ggtree,
treeio,
tidyr
Suggests:
knitr,
rmarkdown,
ggtree,
tidyverse,
treeio
testthat (>= 3.0.0)
VignetteBuilder: knitr
Year: 2024
Config/testthat/edition: 3
7 changes: 7 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(beastclus)
export(genclus)
importFrom(ape,extract.clade)
importFrom(ape,getMRCA)
Expand All @@ -11,9 +12,15 @@ importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,inner_join)
importFrom(dplyr,mutate)
importFrom(dplyr,rename)
importFrom(dplyr,rowwise)
importFrom(dplyr,select)
importFrom(dplyr,slice)
importFrom(dplyr,summarize)
importFrom(dplyr,ungroup)
importFrom(ggtree,geom_range)
importFrom(ggtree,ggtree)
importFrom(ggtree,theme_tree2)
importFrom(progress,progress_bar)
importFrom(stringr,str_split)
importFrom(tidyr,separate_rows)
142 changes: 142 additions & 0 deletions R/beastclus.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#' Extract Clusters from a BEAST Phylogenetic Tree
#'
#' @param beast_tree A BEAST tree object of class `phylo4d` or similar.
#' @param metadata A metadata dataframe containing columns `label`, `location`, and `date`.
#' @param post_threshold Numeric value specifying the posterior probability threshold for including a cluster (default = 0.70).
#' @param date_range Numeric value specifying the maximum allowable date range (in days) within clusters (default = 90).
#' @param samearea Logical value indicating whether clusters must originate from the same geographic area (default = FALSE).
#'
#' @return A dataframe summarizing the clusters, including labels, posterior probabilities, areas, date ranges, and the number of tips.
#' @importFrom dplyr group_by summarize ungroup filter select arrange rename mutate inner_join
#' @importFrom tidyr separate_rows
#' @importFrom ggtree ggtree theme_tree2 geom_range
#' @importFrom progress progress_bar
#' @export
#' @examples
#' # Example usage:
#' # Assuming `beast_tree` is a BEAST tree object and `metadata` is a dataframe
#' data_csv <- system.file("extdata", "metadata_samp.csv", package = "caIRA")
#' metadata<-read.csv(data_csv)
#' data_beast <- system.file("extdata", "pox_strict_comb.tree", package = "caIRA")
#' beast_tree <- treeio::read.beast(data_beast)
#' # with the required columns:
#' beastclus(beast_tree, metadata, post_threshold = 0.50, date_range = 90, samearea = TRUE)
beastclus <- function(beast_tree, metadata, post_threshold = 0.70, date_range = 90, samearea = FALSE) {
library(treeio)
library(ape)
library(ggtree)
library(dplyr)
library(tidyr)

# Calculate the total number of iterations for progress tracking
tip_count <- length(beast_tree@phylo$tip.label)
total_iterations <- (tip_count * (tip_count - 1)) / 2

# Set up the progress bar
pb <- progress_bar$new(
format = "(:spin) [:bar] :percent [Elapsed time: :elapsedfull || Estimated time remaining: :eta]",
total = total_iterations,
complete = "=", # Completion bar character
incomplete = "-", # Incomplete bar character
current = ">", # Current bar character
clear = FALSE, # If TRUE, clears the bar when finish
width = 100 # Width of the progress bar
)
# Number of tips in the tree (leaf nodes)
tree <- beast_tree@phylo
num_tips <- length(tree$tip.label)

# Extract internal node numbers using the edge matrix (parent-child relationships)
internal_nodes <- unique(tree$edge[, 1][tree$edge[, 1] > num_tips])

# Create an empty list to store clades and their leaf nodes
clade_leaf_nodes <- list()

# Loop through each internal node and extract the clade
for (i in internal_nodes) {
# Extract the clade rooted at the current internal node
clade <- extract.clade(tree, node = i)

# Get the leaf (tip) names for the clade
leaf_nodes <- clade$tip.label

# Store the clade and its leaf nodes in the list
clade_leaf_nodes[[as.character(i)]] <- leaf_nodes # Assign node number as the name
}

# Convert the list to a data frame for easier visualization
clade_df <- data.frame(
nodes = rep(names(clade_leaf_nodes), sapply(clade_leaf_nodes, length)),
label = unlist(clade_leaf_nodes)
)

# Merging posterior probability from tree data
tre <- ggtree(beast_tree, mrsd = min(metadata$date)) +
theme_tree2() + geom_range(range = 'length_0.95_HPD', color = 'red', alpha = .6, size = 2)
post_dat <- tre$data
post_dat <- select(post_dat, node, posterior, x)

clade_df <- merge(clade_df, post_dat, by.x = "nodes", by.y = "node", all.x = TRUE)

# Merging metadata
metadata_2 <- select(metadata, label, location, date)
clade_df <- merge(clade_df, metadata_2, by.x = "label", by.y = "label", all.x = TRUE)

# Summarize data
summary_df <- clade_df %>%
group_by(nodes) %>%
summarize(
label = paste(label, collapse = ", "), # Merge all labels for the group
Posterior = round(mean(posterior, na.rm = TRUE), 2), # Use mean posterior (or adjust as needed)
AreaName = ifelse(length(unique(location)) > 1, # Check the number of unique locations
paste(unique(location), collapse = ", "),
unique(location)), # Use single location if only one
min_date = min(as.Date(date)), # Minimum date in the group
max_date = max(as.Date(date)), # Maximum date in the group
NumTips = n(), # Count of label entries
.groups = "drop" # Avoid grouping in the output
)

summary_df$dif <- as.numeric(summary_df$max_date - summary_df$min_date)
summary_df = rename(summary_df, c('ParentNode' = 'nodes'))

# Filtering the data based on the posterior and date range criteria
filtered_df <- summary_df %>%
filter(Posterior > post_threshold,
dif <= date_range,
(samearea == TRUE & !grepl(",", AreaName)) | samearea == FALSE)

# Separate the label into different rows based on ',' separator
filtered_df <- filtered_df %>%
separate_rows(label, sep = ", ") # Use ", " as separator to split the label

# Remove duplicate labels, keeping the one with the maximum NumTips
filtered_df <- filtered_df %>%
group_by(label) %>%
filter(NumTips == max(NumTips)) %>%
ungroup()

# Merge the `label` values with a comma separator
final_df <- filtered_df %>%
group_by(ParentNode) %>%
summarize(
label = paste(label, collapse = ", "), # Merge labels with a comma separator
Posterior = first(Posterior), # Keep the first value of `Posterior`
AreaName = first(AreaName), # Keep the first value of `AreaName`
min_date = min(min_date), # Get the minimum date
max_date = max(max_date), # Get the maximum date
NumTips = first(NumTips), # Sum NumTips (if needed)
dif = max(dif), # Get the maximum difference
.groups = "drop" # Remove the grouping after summarizing
)

# Convert ParentNode to numeric
final_df$ParentNode <- as.numeric(final_df$ParentNode)

#order date
final_df <- final_df[order(final_df$min_date),]

return(final_df)
}

utils::globalVariables(c("node", "posterior", "label", "location", "n"))
Loading

0 comments on commit 9566d9d

Please sign in to comment.