diff --git a/DESCRIPTION b/DESCRIPTION index 597eeb8..be28a5b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ -Package: huebreaker +Package: caIRA Type: Package -Title: Collecting the Cluster Based Tree and Metadata -Version: 0.1.2 +Title: Collecting and Identification from Outbreak Cluster Based Tree and Metadata +Version: 0.1.3 Author: Dhihram Tenrisau, Stéphane Hué Maintainer: Dhihram Tenrisau Description: This package is used to find the cluster-based tree and metadata. The cluster-based tree is built based on the clusters in the data. The metadata contains information about the data used to build the tree. @@ -9,11 +9,12 @@ License: file LICENSE Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Imports: ape, dplyr, - stringr + stringr, + progress Suggests: knitr, rmarkdown, diff --git a/NAMESPACE b/NAMESPACE index e97c101..c2ef08f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,6 @@ # Generated by roxygen2: do not edit by hand -export(huebreaker) +export(genclus) importFrom(ape,extract.clade) importFrom(ape,getMRCA) importFrom(dplyr,"%>%") @@ -15,5 +15,5 @@ importFrom(dplyr,rowwise) importFrom(dplyr,select) importFrom(dplyr,slice) importFrom(dplyr,ungroup) +importFrom(progress,progress_bar) importFrom(stringr,str_split) -importFrom(utils,globalVariables) diff --git a/NEWS.md b/NEWS.md index ba035dd..c84aa1a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,15 @@ +# caIRA 0.1.3 + +## New Features +- Renamed the package to caIRA; the exported function `huebreaker()` is now `genclus()`. + +## Improvements +- Added a progress bar (via the `progress` package) that is shown while the tip pairs are scanned. + +## Bug Fixes +- Made the function faster by checking the bootstrap threshold before extracting each clade. 
+ + # MyPackage 0.1.2 ## New Features diff --git a/R/huebreaker.R b/R/caIRA.R similarity index 59% rename from R/huebreaker.R rename to R/caIRA.R index 5474b9f..a615f15 100644 --- a/R/huebreaker.R +++ b/R/caIRA.R @@ -1,25 +1,28 @@ #' Collecting the Cluster Based Tree and Metadata #' -#' @param tree tree file -#' @param metat metadata dataframe contains `label`, `location`, and `date` -#' @param bootstrap_threshold Numeric, the bootstrap threshold -#' @param date_range Numeric, date range of the tip (days) -#' @param samearea Logical, whether to consider only groups from the same area +#' This function identifies monophyletic groups within a phylogenetic tree based on provided criteria, such as bootstrap threshold, date range, and geographic location. #' -#' @return a data frame containing the monophyletic groups that meet the criteria -#' @export +#' @param tree A phylogenetic tree object of class `phylo` (from the `ape` package) whose `node.label` holds the bootstrap values; the function stops with an error if the tree has no bootstrap values. +#' @param metat A metadata dataframe containing at least the columns `label`, `location`, and `date`. +#' @param bootstrap_threshold Numeric value specifying the minimum bootstrap support required for a clade to be considered. +#' @param date_range Numeric value specifying the range of dates (in days) within which tips must fall to be considered as part of the same group. +#' @param samearea Logical flag indicating whether only groups from the same geographic area should be considered. #' +#' @return A dataframe containing the monophyletic groups that meet the specified criteria. 
+#' @export #' @importFrom ape getMRCA extract.clade -#' @importFrom dplyr %>% rowwise mutate ungroup filter select inner_join arrange desc group_by slice bind_rows +#' @importFrom dplyr %>% filter select mutate rowwise ungroup inner_join arrange desc group_by slice bind_rows #' @importFrom stringr str_split -#' @importFrom utils globalVariables +#' @importFrom progress progress_bar #' #' @examples #' # Load necessary packages #' library(ape) #' library(dplyr) #' library(stringr) -#' # Generate a random tree with 20 tips +#' library(progress) +#' +#' # Generate a random tree with 20 tips #' tree <- rtree(n = 20) #' # Generate random bootstrap values for each node #' set.seed(666) @@ -33,9 +36,10 @@ #' random_dates <- sample(seq(start_date, end_date, by = "day"), size = length(tree$tip.label), replace = TRUE) #' # Create a data frame with the metadata #' metadata <- data.frame(label = tree$tip.label, location = areas, date = random_dates) -#' # Run the huebreaker function -#' huebreaker(tree, metadata, bootstrap_threshold = 90, date_range = 30, samearea = TRUE) -huebreaker <- function(tree, metat, bootstrap_threshold, date_range, samearea) { +#' # Run the genclus function +#' genclus(tree, metadata, bootstrap_threshold = 90, date_range = 30, samearea = TRUE) +genclus <- function(tree, metat, bootstrap_threshold, date_range, samearea) { + bootstrap_threshold <- as.numeric(bootstrap_threshold) date_range <- as.numeric(date_range) samearea <- as.logical(samearea) @@ -45,22 +49,56 @@ huebreaker <- function(tree, metat, bootstrap_threshold, date_range, samearea) { stop("The tree does not contain bootstrap values.") } - # Extract monophyletic groups - groups <- list() + # Identify the root node (usually it's the first node after the tips) + root_node <- length(tree$tip.label) + 1 + + # Calculate the total number of iterations for progress tracking tip_count <- length(tree$tip.label) + total_iterations <- (tip_count * (tip_count - 1)) / 2 + + # Set up the progress bar + pb 
<- progress_bar$new( + format = "(:spin) [:bar] :percent [Elapsed time: :elapsedfull || Estimated time remaining: :eta]", + total = total_iterations, + complete = "=", # Completion bar character + incomplete = "-", # Incomplete bar character + current = ">", # Current bar character + clear = FALSE, # If TRUE, clears the bar when finish + width = 100 # Width of the progress bar + ) + + # Extract monophyletic groups with bootstrap value filtering first + groups <- list() + for (i in 1:(tip_count - 1)) { for (j in (i + 1):tip_count) { + # Update progress + pb$tick() + common_ancestor <- getMRCA(tree, c(tree$tip.label[i], tree$tip.label[j])) - clade <- extract.clade(tree, common_ancestor) - if (!is.null(clade) && all(clade$tip.label %in% tree$tip.label)) { - group <- sort(clade$tip.label) - group_name <- paste(group, collapse = ", ") - if (!group_name %in% names(groups)) { - groups[[group_name]] <- list( - tips = group, - bootstrap_value = tree$node.label[common_ancestor - length(tree$tip.label)], - parent_node = common_ancestor - ) + + # Determine bootstrap value, set to NA if the node is the root + if (common_ancestor == root_node) { + bootstrap_value <- NA + } else { + bootstrap_value <- tree$node.label[common_ancestor - length(tree$tip.label)] + } + + # Proceed only if the bootstrap value is NA (root) or meets the threshold + if (is.na(bootstrap_value) || bootstrap_value >= bootstrap_threshold) { + clade <- extract.clade(tree, common_ancestor) + + if (!is.null(clade) && all(clade$tip.label %in% tree$tip.label)) { + group <- sort(clade$tip.label) + group_name <- paste(group, collapse = ", ") + + if (!group_name %in% names(groups)) { + groups[[group_name]] <- list( + tips = group, + bootstrap_value = bootstrap_value, + parent_node = common_ancestor + ) + } } } } @@ -99,18 +137,18 @@ huebreaker <- function(tree, metat, bootstrap_threshold, date_range, samearea) { ) %>% ungroup() - # Convert Bootstrap to numeric + # Convert Bootstrap to numeric (though already filtered 
above, it's good to ensure this) monophyletic_df <- monophyletic_df %>% mutate(Bootstrap = as.numeric(Bootstrap)) - # Filter the DataFrame based on bootstrap values, same area if required, and date range within given days + # Filter the DataFrame based on the same area if required and date range within given days if (samearea) { filtered_df <- monophyletic_df %>% - filter(Bootstrap >= bootstrap_threshold, AllSameArea == TRUE, diff <= date_range) %>% + filter(AllSameArea == TRUE, diff <= date_range) %>% select(Group, Tips, Bootstrap, ParentNode, AreaName, MinDate, MaxDate, diff) } else { filtered_df <- monophyletic_df %>% - filter(Bootstrap >= bootstrap_threshold, diff <= date_range) %>% + filter(diff <= date_range) %>% select(Group, Tips, Bootstrap, ParentNode, AreaName, MinDate, MaxDate, diff) } @@ -160,3 +198,4 @@ huebreaker <- function(tree, metat, bootstrap_threshold, date_range, samearea) { # Declare global variables to avoid R CMD check warnings utils::globalVariables(c("Tips", "TipList", "Areas", "AllSameArea", "Dates", "MaxDate", "MinDate", "Bootstrap", "Group", "ParentNode", "AreaName", "ID", "NumTips")) + diff --git a/huebreaker.Rproj b/caIRA.Rproj similarity index 100% rename from huebreaker.Rproj rename to caIRA.Rproj diff --git a/docs/index.Rmd b/docs/index.Rmd index 4c329d5..59345de 100644 --- a/docs/index.Rmd +++ b/docs/index.Rmd @@ -1,8 +1,8 @@ --- -title: "Collecting the Cluster with huebreaker" +title: "Collecting and Identification the Outbreak Cluster" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Collecting the Cluster with huebreaker} + %\VignetteIndexEntry{Collecting and Identification the Outbreak Cluster} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -14,13 +14,19 @@ knitr::opts_chunk$set( ) ``` -The goal of huebreaker is to find the cluster based tree and metadata of the data. 
The package is based on the paper: +The goal of Collecting and Identification the Outbreak Cluster (caIRA) is to find the cluster based tree and metadata of the data. The package is based on the paper: Ragonnet-Cronin, M., Hodcroft, E., Hué, S. et al. Automated analysis of phylogenetic clusters. BMC Bioinformatics 14, 317 (2013). Hall M, Woolhouse M, Rambaut A (2015) Epidemic Reconstruction in a Phylogenetics Framework: Transmission Trees as Partitions of the Node Set. PLoS Comput Biol 11(12): e1004613. -This package is the part of Dhihram Tenrisau, MSc Health Data Science summer project, 'Phylodynamic of Norovirus in UK 2003-2023'. The project is supervised by Stéphane Hué +This package is part of Dhihram Tenrisau's MSc Health Data Science summer project, 'Phylodynamic of Norovirus in UK 2003-2023'. The project is supervised by Stéphane Hué. + +By using this package, hope will be emerging (not the disease), as in the lyrics of the song 'Ah Ça Ira': + +*Ah ça ira, réjouis-toi!* (It'll be fine, rejoice!) + +*Ah ça ira, le bon temps viendra* (It'll be fine, good times will come) ## Installation @@ -28,7 +34,7 @@ You can install the development version of huebreaker from [GitHub](https://gith ``` r # install.packages("devtools") -devtools::install_github("Dhihram/huebreaker") +devtools::install_github("Dhihram/caIRA") ``` ## Example @@ -36,11 +42,11 @@ devtools::install_github("Dhihram/huebreaker") This package needs the additional package `tidyverse`, `ape`, `treeio`, and `dplyr` ```{r, warning=FALSE, message=FALSE} -library(huebreaker) library(tidyverse) library(ape) library(dplyr) library(treeio) +library(caIRA) ``` ### Data @@ -75,15 +81,12 @@ gg ### Package Utilization -This package will utilize: -1. Finding and clustering the monophylectic groups in the tree -2. Add the parameter of the clusters: `bootstrap_treshold`, `data_range`, and `samearea` -3. Keep the maximum monophylectic groups in the cluster identify +This package will utilize: 1. 
Finding and clustering the monophyletic groups in the tree 2. Add the parameter of the clusters: `bootstrap_threshold`, `date_range`, and `samearea` 3. Keep the maximum monophyletic groups in the cluster identify the `bootstrap_treshold` is the minimum bootstrap value to be considered as a cluster. The `data_range` is the range of the days to be considered as a cluster. The `samearea` is the boolean value to consider the same area as a cluster. ```{r} -res <- huebreaker(tree, metat, bootstrap_threshold = 80, date_range = 30, samearea = TRUE) +res <- genclus(tree, metat, bootstrap_threshold = 80, date_range = 30, samearea = TRUE) knitr::kable(res) ``` diff --git a/docs/index.html b/docs/index.html index 96ca026..0e4d634 100644 --- a/docs/index.html +++ b/docs/index.html @@ -12,7 +12,7 @@ -Collecting the Cluster with huebreaker +Collecting and Identification the Outbreak Cluster