|
| 1 | +#' Extract Clusters from a BEAST Phylogenetic Tree |
| 2 | +#' |
| 3 | +#' @param beast_tree A BEAST tree object of class `phylo4d` or similar. |
| 4 | +#' @param metadata A metadata dataframe containing columns `label`, `location`, and `date`. |
| 5 | +#' @param post_threshold Numeric value specifying the posterior probability threshold for including a cluster (default = 0.70). |
| 6 | +#' @param date_range Numeric value specifying the maximum allowable date range (in days) within clusters (default = 90). |
| 7 | +#' @param samearea Logical value indicating whether clusters must originate from the same geographic area (default = FALSE). |
| 8 | +#' |
| 9 | +#' @return A dataframe summarizing the clusters, including labels, posterior probabilities, areas, date ranges, and the number of tips. |
| 10 | +#' @importFrom dplyr group_by summarize ungroup filter select arrange rename mutate inner_join |
| 11 | +#' @importFrom tidyr separate_rows |
| 12 | +#' @importFrom ggtree ggtree theme_tree2 geom_range |
| 13 | +#' @importFrom progress progress_bar |
| 14 | +#' @export |
| 15 | +#' @examples |
| 16 | +#' # Example usage: |
| 17 | +#' # Assuming `beast_tree` is a BEAST tree object and `metadata` is a dataframe |
| 18 | +#' data_csv <- system.file("extdata", "metadata_samp.csv", package = "caIRA") |
| 19 | +#' metadata<-read.csv(data_csv) |
| 20 | +#' data_beast <- system.file("extdata", "pox_strict_comb.tree", package = "caIRA") |
| 21 | +#' beast_tree <- treeio::read.beast(data_beast) |
| 22 | +#' # with the required columns: |
| 23 | +#' beastclus(beast_tree, metadata, post_threshold = 0.50, date_range = 90, samearea = TRUE) |
| 24 | +beastclus <- function(beast_tree, metadata, post_threshold = 0.70, date_range = 90, samearea = FALSE) { |
| 25 | + library(treeio) |
| 26 | + library(ape) |
| 27 | + library(ggtree) |
| 28 | + library(dplyr) |
| 29 | + library(tidyr) |
| 30 | + |
| 31 | + # Calculate the total number of iterations for progress tracking |
| 32 | + tip_count <- length(beast_tree@phylo$tip.label) |
| 33 | + total_iterations <- (tip_count * (tip_count - 1)) / 2 |
| 34 | + |
| 35 | + # Set up the progress bar |
| 36 | + pb <- progress_bar$new( |
| 37 | + format = "(:spin) [:bar] :percent [Elapsed time: :elapsedfull || Estimated time remaining: :eta]", |
| 38 | + total = total_iterations, |
| 39 | + complete = "=", # Completion bar character |
| 40 | + incomplete = "-", # Incomplete bar character |
| 41 | + current = ">", # Current bar character |
| 42 | + clear = FALSE, # If TRUE, clears the bar when finish |
| 43 | + width = 100 # Width of the progress bar |
| 44 | + ) |
| 45 | + # Number of tips in the tree (leaf nodes) |
| 46 | + tree <- beast_tree@phylo |
| 47 | + num_tips <- length(tree$tip.label) |
| 48 | + |
| 49 | + # Extract internal node numbers using the edge matrix (parent-child relationships) |
| 50 | + internal_nodes <- unique(tree$edge[, 1][tree$edge[, 1] > num_tips]) |
| 51 | + |
| 52 | + # Create an empty list to store clades and their leaf nodes |
| 53 | + clade_leaf_nodes <- list() |
| 54 | + |
| 55 | + # Loop through each internal node and extract the clade |
| 56 | + for (i in internal_nodes) { |
| 57 | + # Extract the clade rooted at the current internal node |
| 58 | + clade <- extract.clade(tree, node = i) |
| 59 | + |
| 60 | + # Get the leaf (tip) names for the clade |
| 61 | + leaf_nodes <- clade$tip.label |
| 62 | + |
| 63 | + # Store the clade and its leaf nodes in the list |
| 64 | + clade_leaf_nodes[[as.character(i)]] <- leaf_nodes # Assign node number as the name |
| 65 | + } |
| 66 | + |
| 67 | + # Convert the list to a data frame for easier visualization |
| 68 | + clade_df <- data.frame( |
| 69 | + nodes = rep(names(clade_leaf_nodes), sapply(clade_leaf_nodes, length)), |
| 70 | + label = unlist(clade_leaf_nodes) |
| 71 | + ) |
| 72 | + |
| 73 | + # Merging posterior probability from tree data |
| 74 | + tre <- ggtree(beast_tree, mrsd = min(metadata$date)) + |
| 75 | + theme_tree2() + geom_range(range = 'length_0.95_HPD', color = 'red', alpha = .6, size = 2) |
| 76 | + post_dat <- tre$data |
| 77 | + post_dat <- select(post_dat, node, posterior, x) |
| 78 | + |
| 79 | + clade_df <- merge(clade_df, post_dat, by.x = "nodes", by.y = "node", all.x = TRUE) |
| 80 | + |
| 81 | + # Merging metadata |
| 82 | + metadata_2 <- select(metadata, label, location, date) |
| 83 | + clade_df <- merge(clade_df, metadata_2, by.x = "label", by.y = "label", all.x = TRUE) |
| 84 | + |
| 85 | + # Summarize data |
| 86 | + summary_df <- clade_df %>% |
| 87 | + group_by(nodes) %>% |
| 88 | + summarize( |
| 89 | + label = paste(label, collapse = ", "), # Merge all labels for the group |
| 90 | + Posterior = round(mean(posterior, na.rm = TRUE), 2), # Use mean posterior (or adjust as needed) |
| 91 | + AreaName = ifelse(length(unique(location)) > 1, # Check the number of unique locations |
| 92 | + paste(unique(location), collapse = ", "), |
| 93 | + unique(location)), # Use single location if only one |
| 94 | + min_date = min(as.Date(date)), # Minimum date in the group |
| 95 | + max_date = max(as.Date(date)), # Maximum date in the group |
| 96 | + NumTips = n(), # Count of label entries |
| 97 | + .groups = "drop" # Avoid grouping in the output |
| 98 | + ) |
| 99 | + |
| 100 | + summary_df$dif <- as.numeric(summary_df$max_date - summary_df$min_date) |
| 101 | + summary_df = rename(summary_df, c('ParentNode' = 'nodes')) |
| 102 | + |
| 103 | + # Filtering the data based on the posterior and date range criteria |
| 104 | + filtered_df <- summary_df %>% |
| 105 | + filter(Posterior > post_threshold, |
| 106 | + dif <= date_range, |
| 107 | + (samearea == TRUE & !grepl(",", AreaName)) | samearea == FALSE) |
| 108 | + |
| 109 | + # Separate the label into different rows based on ',' separator |
| 110 | + filtered_df <- filtered_df %>% |
| 111 | + separate_rows(label, sep = ", ") # Use ", " as separator to split the label |
| 112 | + |
| 113 | + # Remove duplicate labels, keeping the one with the maximum NumTips |
| 114 | + filtered_df <- filtered_df %>% |
| 115 | + group_by(label) %>% |
| 116 | + filter(NumTips == max(NumTips)) %>% |
| 117 | + ungroup() |
| 118 | + |
| 119 | + # Merge the `label` values with a comma separator |
| 120 | + final_df <- filtered_df %>% |
| 121 | + group_by(ParentNode) %>% |
| 122 | + summarize( |
| 123 | + label = paste(label, collapse = ", "), # Merge labels with a comma separator |
| 124 | + Posterior = first(Posterior), # Keep the first value of `Posterior` |
| 125 | + AreaName = first(AreaName), # Keep the first value of `AreaName` |
| 126 | + min_date = min(min_date), # Get the minimum date |
| 127 | + max_date = max(max_date), # Get the maximum date |
| 128 | + NumTips = first(NumTips), # Sum NumTips (if needed) |
| 129 | + dif = max(dif), # Get the maximum difference |
| 130 | + .groups = "drop" # Remove the grouping after summarizing |
| 131 | + ) |
| 132 | + |
| 133 | + # Convert ParentNode to numeric |
| 134 | + final_df$ParentNode <- as.numeric(final_df$ParentNode) |
| 135 | + |
| 136 | + #order date |
| 137 | + final_df <- final_df[order(final_df$min_date),] |
| 138 | + |
| 139 | + return(final_df) |
| 140 | +} |
| 141 | + |
| 142 | +utils::globalVariables(c("node", "posterior", "label", "location", "n")) |
0 commit comments