Skip to content

Commit 9566d9d

Browse files
committed
add beast function tree and data
1 parent 7d2ee12 commit 9566d9d

9 files changed

+408
-4
lines changed

DESCRIPTION

+7-4
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ Version: 0.1.3
55
Author: Dhihram Tenrisau, Stéphane Hué
66
Maintainer: Dhihram Tenrisau <[email protected]>
77
Description: This package is used to find the cluster-based tree and metadata. The cluster-based tree is built based on the clusters in the data. The metadata contains information about the data used to build the tree.
8-
License: file LICENSE
8+
License: MIT + file LICENSE
99
Encoding: UTF-8
1010
LazyData: true
1111
Roxygen: list(markdown = TRUE)
@@ -14,12 +14,15 @@ Imports:
1414
ape,
1515
dplyr,
1616
stringr,
17-
progress
17+
progress,
18+
ggtree,
19+
treeio,
20+
tidyr
1821
Suggests:
1922
knitr,
2023
rmarkdown,
21-
ggtree,
2224
tidyverse,
23-
treeio
25+
testthat (>= 3.0.0)
2426
VignetteBuilder: knitr
2527
Year: 2024
28+
Config/testthat/edition: 3

NAMESPACE

+7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Generated by roxygen2: do not edit by hand
22

3+
export(beastclus)
34
export(genclus)
45
importFrom(ape,extract.clade)
56
importFrom(ape,getMRCA)
@@ -11,9 +12,15 @@ importFrom(dplyr,filter)
1112
importFrom(dplyr,group_by)
1213
importFrom(dplyr,inner_join)
1314
importFrom(dplyr,mutate)
15+
importFrom(dplyr,rename)
1416
importFrom(dplyr,rowwise)
1517
importFrom(dplyr,select)
1618
importFrom(dplyr,slice)
19+
importFrom(dplyr,summarize)
1720
importFrom(dplyr,ungroup)
21+
importFrom(ggtree,geom_range)
22+
importFrom(ggtree,ggtree)
23+
importFrom(ggtree,theme_tree2)
1824
importFrom(progress,progress_bar)
1925
importFrom(stringr,str_split)
26+
importFrom(tidyr,separate_rows)

R/beastclus.R

+142
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
#' Extract Clusters from a BEAST Phylogenetic Tree
2+
#'
3+
#' @param beast_tree A BEAST tree object of class `phylo4d` or similar.
4+
#' @param metadata A metadata dataframe containing columns `label`, `location`, and `date`.
5+
#' @param post_threshold Numeric value specifying the posterior probability threshold for including a cluster (default = 0.70).
6+
#' @param date_range Numeric value specifying the maximum allowable date range (in days) within clusters (default = 90).
7+
#' @param samearea Logical value indicating whether clusters must originate from the same geographic area (default = FALSE).
8+
#'
9+
#' @return A dataframe summarizing the clusters, including labels, posterior probabilities, areas, date ranges, and the number of tips.
10+
#' @importFrom dplyr group_by summarize ungroup filter select arrange rename mutate inner_join
11+
#' @importFrom tidyr separate_rows
12+
#' @importFrom ggtree ggtree theme_tree2 geom_range
13+
#' @importFrom progress progress_bar
14+
#' @export
15+
#' @examples
16+
#' # Example usage:
17+
#' # Assuming `beast_tree` is a BEAST tree object and `metadata` is a dataframe
18+
#' data_csv <- system.file("extdata", "metadata_samp.csv", package = "caIRA")
19+
#' metadata<-read.csv(data_csv)
20+
#' data_beast <- system.file("extdata", "pox_strict_comb.tree", package = "caIRA")
21+
#' beast_tree <- treeio::read.beast(data_beast)
22+
#' # with the required columns:
23+
#' beastclus(beast_tree, metadata, post_threshold = 0.50, date_range = 90, samearea = TRUE)
24+
beastclus <- function(beast_tree, metadata, post_threshold = 0.70, date_range = 90, samearea = FALSE) {
25+
library(treeio)
26+
library(ape)
27+
library(ggtree)
28+
library(dplyr)
29+
library(tidyr)
30+
31+
# Calculate the total number of iterations for progress tracking
32+
tip_count <- length(beast_tree@phylo$tip.label)
33+
total_iterations <- (tip_count * (tip_count - 1)) / 2
34+
35+
# Set up the progress bar
36+
pb <- progress_bar$new(
37+
format = "(:spin) [:bar] :percent [Elapsed time: :elapsedfull || Estimated time remaining: :eta]",
38+
total = total_iterations,
39+
complete = "=", # Completion bar character
40+
incomplete = "-", # Incomplete bar character
41+
current = ">", # Current bar character
42+
clear = FALSE, # If TRUE, clears the bar when finish
43+
width = 100 # Width of the progress bar
44+
)
45+
# Number of tips in the tree (leaf nodes)
46+
tree <- beast_tree@phylo
47+
num_tips <- length(tree$tip.label)
48+
49+
# Extract internal node numbers using the edge matrix (parent-child relationships)
50+
internal_nodes <- unique(tree$edge[, 1][tree$edge[, 1] > num_tips])
51+
52+
# Create an empty list to store clades and their leaf nodes
53+
clade_leaf_nodes <- list()
54+
55+
# Loop through each internal node and extract the clade
56+
for (i in internal_nodes) {
57+
# Extract the clade rooted at the current internal node
58+
clade <- extract.clade(tree, node = i)
59+
60+
# Get the leaf (tip) names for the clade
61+
leaf_nodes <- clade$tip.label
62+
63+
# Store the clade and its leaf nodes in the list
64+
clade_leaf_nodes[[as.character(i)]] <- leaf_nodes # Assign node number as the name
65+
}
66+
67+
# Convert the list to a data frame for easier visualization
68+
clade_df <- data.frame(
69+
nodes = rep(names(clade_leaf_nodes), sapply(clade_leaf_nodes, length)),
70+
label = unlist(clade_leaf_nodes)
71+
)
72+
73+
# Merging posterior probability from tree data
74+
tre <- ggtree(beast_tree, mrsd = min(metadata$date)) +
75+
theme_tree2() + geom_range(range = 'length_0.95_HPD', color = 'red', alpha = .6, size = 2)
76+
post_dat <- tre$data
77+
post_dat <- select(post_dat, node, posterior, x)
78+
79+
clade_df <- merge(clade_df, post_dat, by.x = "nodes", by.y = "node", all.x = TRUE)
80+
81+
# Merging metadata
82+
metadata_2 <- select(metadata, label, location, date)
83+
clade_df <- merge(clade_df, metadata_2, by.x = "label", by.y = "label", all.x = TRUE)
84+
85+
# Summarize data
86+
summary_df <- clade_df %>%
87+
group_by(nodes) %>%
88+
summarize(
89+
label = paste(label, collapse = ", "), # Merge all labels for the group
90+
Posterior = round(mean(posterior, na.rm = TRUE), 2), # Use mean posterior (or adjust as needed)
91+
AreaName = ifelse(length(unique(location)) > 1, # Check the number of unique locations
92+
paste(unique(location), collapse = ", "),
93+
unique(location)), # Use single location if only one
94+
min_date = min(as.Date(date)), # Minimum date in the group
95+
max_date = max(as.Date(date)), # Maximum date in the group
96+
NumTips = n(), # Count of label entries
97+
.groups = "drop" # Avoid grouping in the output
98+
)
99+
100+
summary_df$dif <- as.numeric(summary_df$max_date - summary_df$min_date)
101+
summary_df = rename(summary_df, c('ParentNode' = 'nodes'))
102+
103+
# Filtering the data based on the posterior and date range criteria
104+
filtered_df <- summary_df %>%
105+
filter(Posterior > post_threshold,
106+
dif <= date_range,
107+
(samearea == TRUE & !grepl(",", AreaName)) | samearea == FALSE)
108+
109+
# Separate the label into different rows based on ',' separator
110+
filtered_df <- filtered_df %>%
111+
separate_rows(label, sep = ", ") # Use ", " as separator to split the label
112+
113+
# Remove duplicate labels, keeping the one with the maximum NumTips
114+
filtered_df <- filtered_df %>%
115+
group_by(label) %>%
116+
filter(NumTips == max(NumTips)) %>%
117+
ungroup()
118+
119+
# Merge the `label` values with a comma separator
120+
final_df <- filtered_df %>%
121+
group_by(ParentNode) %>%
122+
summarize(
123+
label = paste(label, collapse = ", "), # Merge labels with a comma separator
124+
Posterior = first(Posterior), # Keep the first value of `Posterior`
125+
AreaName = first(AreaName), # Keep the first value of `AreaName`
126+
min_date = min(min_date), # Get the minimum date
127+
max_date = max(max_date), # Get the maximum date
128+
NumTips = first(NumTips), # Sum NumTips (if needed)
129+
dif = max(dif), # Get the maximum difference
130+
.groups = "drop" # Remove the grouping after summarizing
131+
)
132+
133+
# Convert ParentNode to numeric
134+
final_df$ParentNode <- as.numeric(final_df$ParentNode)
135+
136+
#order date
137+
final_df <- final_df[order(final_df$min_date),]
138+
139+
return(final_df)
140+
}
141+
142+
utils::globalVariables(c("node", "posterior", "label", "location", "n"))

0 commit comments

Comments
 (0)