diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 7a464d8..0000000 Binary files a/.DS_Store and /dev/null differ diff --git a/.RData b/.RData deleted file mode 100644 index 398bd62..0000000 Binary files a/.RData and /dev/null differ diff --git a/.Rhistory b/.Rhistory deleted file mode 100644 index 4458a9e..0000000 --- a/.Rhistory +++ /dev/null @@ -1,33 +0,0 @@ -devtools::install() -library(MolPad) -library(dplyr) -library(MolPad) -library(dplyr) -data("cheese") -cheesedata <- cheese |> -dplyr::mutate(annotations["kingdom"]) |> -dplyr::rename_with(.cols = 12, ~"type")|> -pre_process() -pathchee <- gAnnotation(annotations,"phylum","class") -# rename(!!enque(arg1)) -# run cmd chekc and cran submit -# wgedon prepare -# edit plot x -- section -View(pathchee) -library(pkgdown) -pkgdown::clean_site() -setwd("/Users/hazelma/Documents/GitHub/MolPad") -pkgdown::clean_site() -pkgdown::build_site() -library(MolPad) -data.frame("ID"=1:5,"Day_1"=rnorm(5),"Day_2"=rnorm(5),"Day_3"=rnorm(5),"Day_4"=rnorm(5),"Day_5"=rnorm(5)) -x <- data.frame("ID"=1:5,"Day_1"=rnorm(5),"Day_2"=rnorm(5),"Day_3"=rnorm(5),"Day_4"=rnorm(5),"Day_5"=rnorm(5)) -x -View(x) -x <- data.frame("ID"=1:5,"Day_1"=rnorm(5),"Day_2"=rnorm(5),"Day_3"=rnorm(5),"Day_4"=rnorm(5),"Day_5"=rnorm(5)) -x -x -x <- data.frame("ID"=1:5,"Day_1"=rnorm(5),"Day_2"=rnorm(5),"Day_3"=rnorm(5),"Day_4"=rnorm(5),"Day_5"=rnorm(5),"Day_6"=rnorm(5),"Day_7"=rnorm(5),"Day_8"=rnorm(5),"type"=c("peptide"*3,"lipid","metabolite")) -x <- data.frame("ID"=1:5,"Day_1"=rnorm(5),"Day_2"=rnorm(5),"Day_3"=rnorm(5),"Day_4"=rnorm(5),"Day_5"=rnorm(5),"Day_6"=rnorm(5),"Day_7"=rnorm(5),"Day_8"=rnorm(5),"type"=c(rep("peptide",3),"lipid","metabolite")) -x -x diff --git a/.Rhistory 2 b/.Rhistory 2 deleted file mode 100644 index 76d31be..0000000 --- a/.Rhistory 2 +++ /dev/null @@ -1,97 +0,0 @@ -knitr::opts_chunk$set(echo = TRUE) -#-----------------------update package---------------------- -library(devtools) -setwd("/Users/hazelma/Documents/GitHub") 
-#create("MolPad") -document("MolPad") #important: generate man -setwd("/Users/hazelma/Documents/GitHub") -#create("MolPad") -document("MolPad") #important: generate man -setwd("/Users/hazelma/Documents/GitHub") -setwd("/Users/hazelma/Documents/GitHub") -#create("MolPad") -document("MolPad") #important: generate man -#-----------------------update package---------------------- -library(devtools) -#create("MolPad") -document("MolPad") #important: generate man -knitr::opts_chunk$set(echo = TRUE) -data("test_data") -gDashboard(cheesedata,cluschee,pathchee,networkchee,dashboardtitle = "Test",id_colname = c("GO_ID","KEGG_ID"),id_type = c("GO","KEGG")) -gDashboard(test_data_processed,test_cluster,test_annotations_processed,test_network,dashboardtitle = "Test",id_colname = c("GO_ID","KEGG_ID"),id_type = c("GO","KEGG")) -traceback() -#create("MolPad") -document("MolPad") #important: generate man -#create("MolPad") -document("MolPad") #important: generate man -#create("MolPad") -document("MolPad") #important: generate man -#create("MolPad") -document("MolPad") #important: generate man -#create("MolPad") -document("MolPad") #important: generate man -#create("MolPad") -document("MolPad") #important: generate man -#create("MolPad") -document("MolPad") #important: generate man -gDashboard(test_data_processed,test_cluster,test_annotations_processed,test_network,dashboardtitle = "Test",id_colname = c("GO_ID","KEGG_ID"),id_type = c("GO","KEGG")) -save(test_data,test_data_processed, -test_annotations,test_annotations_processed, -test_cluster,test_network,file="test_data.RData") -use_test("pre_process.R") -setwd("/Users/hazelma/Documents/GitHub/MolPad") -use_test("pre_process.R") -View(test_data) -View(test_data_processed) -View(test_data) -View(test_data_processed) -View(test_data) -test_data[5,4]. 
<- NA -test_data[5,4] <- NA -View(test_data) -save(test_data,test_data_processed, -test_annotations,test_annotations_processed, -test_cluster,test_network,file="test_data.RData") -#create("MolPad") -document("MolPad") #important: generate man -setwd("/Users/hazelma/Documents/GitHub") -#create("MolPad") -document("MolPad") #important: generate man -setwd("/Users/hazelma/Documents/GitHub/MolPad") -View(test_annotations_processed) -View(test_data) -View(test_data_processed) -use_test("gClusters.R") -class(test_cluster[[2]]) -check <- class(test_cluster[[2]]) -class(test_cluster[[2]]) -rm(check) -class(class(test_cluster[[2]])) -class(test_cluster[[2]]) -devtools::test() -use_test("pre_process.R") -use_test("gClusters.R") -use_test("gAnnotation.R") -View(test_annotations_processed) -use_test("gNetwork") -use_test("gDashboard.R") -use_test("reshape_for_make_functions") -devtools::test() -use_test("gClusters.R") -use_test("gNetwork") -View(test_network) -View(test_network) -devtools::test() -use_test("gClusters.R") -ggplot_build(test_cluster[[2]]) -ggplot_build(test_cluster[[2]])$data -ggplot_build(test_cluster[[2]])$data$y -ggplot_build(test_cluster[[2]])$data -ggplot_build(test_cluster[[2]])$data[,"y"] -ggplot_build(test_cluster[[2]])$data[,2] -ggplot_build(test_cluster[[2]])$data -ggplot_build(test_cluster[[2]])$data[[1]] -is.na(ggplot_build(test_cluster[[2]])$data[[1]][,1:2]) -sum(is.na(ggplot_build(test_cluster[[2]])$data[[1]][,1:2])) -devtools::test() -use_test("gDashboard.R") # not finished yet diff --git a/MolPad.Rproj b/MolPad.Rproj deleted file mode 100644 index 69fafd4..0000000 --- a/MolPad.Rproj +++ /dev/null @@ -1,22 +0,0 @@ -Version: 1.0 - -RestoreWorkspace: No -SaveWorkspace: No -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: Sweave -LaTeX: pdfLaTeX - -AutoAppendNewline: Yes -StripTrailingWhitespace: Yes -LineEndingConversion: Posix - -BuildType: Package -PackageUseDevtools: Yes 
-PackageInstallArgs: --no-multiarch --with-keep.source -PackageRoxygenize: rd,collate,namespace diff --git a/R/.DS_Store b/R/.DS_Store deleted file mode 100644 index cbf97c7..0000000 Binary files a/R/.DS_Store and /dev/null differ diff --git a/README.md b/README.md index e2d6bc1..067c069 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ An R-Shiny Package for Cluster Co-Expression Analysis in Longitudinal Microbiomi ## Overview -MolPad offers a visualization dashboard tool designed to enhance our understanding of how molecular co-expression works in the context of microbiome data. The approach involves using a cluster network to provide an initial overview of relationships across multiple omics, with the added functionality to interactively zoom in on specific areas of interest. To facilitate this analysis, we've developed a focus-plus-context strategy that seamlessly connects to online curated annotations. +MolPad offers a visualization dashboard tool designed to enhance our understanding of how molecular co-expression works in the context of microbiome data. The approach involves using a cluster network to provide an initial overview of relationships across multiple omics, with the added functionality to interactively zoom in on specific areas of interest. To facilitate this analysis, we've developed a focus-plus-context strategy that connects to online curated annotations. Additionally, our package simplifies the entire pipeline for creating the dashboard. This user-friendly design makes it accessible even to people with limited R programming experience. Check out our cheese-data demo [here](https://connect.doit.wisc.edu/molpad-demo/). @@ -22,7 +22,7 @@ install.packages("MolPad") ### MolPad could help you with: -1. Clutering the data with k-means and building a group network. +1. Clustering the data with k-means and building a group network. 2. Find the significant trend patterns in your datasets. 3. 
Target the interaction between groups, taxons, and pathways. 4. Visualize the distribution of features in specific pathways on the group network. @@ -46,6 +46,6 @@ If you need assistance with MolPad, there are two primary ways to seek help: Remember that it's particularly effective when you can provide a reproducible example that shows the specific problem you're having. ## Contribution -To contribute to this project, you could use the following workflow: -fork the repository --> create vour local copy --> submit a pull request. +To contribute to this project, you could use the following workflow: fork the repository --> create your local copy --> submit a pull request. + diff --git a/index.md b/index.md deleted file mode 100644 index 0530d04..0000000 --- a/index.md +++ /dev/null @@ -1,32 +0,0 @@ -MolPad: An R-Shiny Package for Cluster Co-Expression Analysis in Longitudinal Microbiomics -================ -2023-05-31 - -## Overview - -MolPad offers a visualization dashboard tool designed to enhance our understanding of how molecular co-expression works in the context of microbiome data. The approach involves using a cluster network to provide an initial overview of relationships across multiple omics, with the added functionality to interactively zoom in on specific areas of interest. To facilitate this analysis, we've developed a focus-plus-context strategy that seamlessly connects to online curated annotations. - -The dashboard itself comprises several components, including a cluster-level network, a bar plot illustrating taxonomic composition, a line plot displaying data modalities, and a table for each pathway. You can see an illustration of these features in this screenshot. - - - -Additionally, our package simplifies the entire pipeline for creating the dashboard. This user-friendly design makes it accessible even to students with limited R programming experience. - -## MolPad could help you with: - -1. 
Clutering the data with k-means and building a group network. -2. Find the significant trend patterns in your datasets. -3. Target the interaction between groups, taxons, and pathways. -4. Visualize the distribution of features in specific pathways on the group network. -5. Search for particular features and other user-defined labels. -6. Check detailed information on each feature through automatically generated hyperlinks. -7. Have a better overall understanding of the datasets. - - -## Workflow - - - -The first input is a list of datasets, where each should have been normalized and imputed. The second input is a pathway dataset. - -Note that for all the datasets, the first column must be `ID`. For more information, please check the corresponding functions. \ No newline at end of file diff --git a/paper/paper.md b/paper/paper.md index 7e920df..c036001 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -15,7 +15,7 @@ authors: orcid: 0000-0002-9415-1971 affiliation: "1, 2" - name: Margaret Thairu - orcid: (orcid). + orcid: 0000-0002-2799-6261 affiliation: 2 affiliations: - name: University of Wisconsin-Madison, Department of Statistics, USA @@ -27,42 +27,43 @@ bibliography: paper.bib # Optional fields if submitting to a AAS journal too, see this blog post: # https://blog.joss.theoj.org/2018/12/a-new-collaboration-with-aas-publishing -aas-doi: 10.3847/xxxxx <- update this with the DOI from AAS once you know it. -aas-journal: Astrophysical Journal <- The name of the AAS journal. +aas-doi: +aas-journal: --- # Summary -The R-Shiny package MolPad provides an interactive dashboard for understanding the dynamics of longitudinal molecular co-expression in microbiomics. The main idea for addressing the issue is first to use a network to overview major patterns among their predictive relationships and then zoom into specific clusters of interest. It is designed with a focus-plus-context analysis strategy and automatically generates links to online curated annotations. 
The dashboard consists of a cluster-level network, a bar plot of taxonomic composition, a line plot of data modalities, and a table for each pathway, as illustrated in Fig \ref{fig:dashboard}. Plus, the package includes functions that handle the data processing for creating the dashboard. This makes it beginner-friendly for users with less R programming experience. We illustrate these methods with a case study on a longitudinal, multi-platform metagenomics analysis for cheese communities. +The R-Shiny package MolPad provides an interactive dashboard for understanding the dynamics of longitudinal molecular co-expression in microbiomics. The main idea for addressing the issue is first to use a network to overview major patterns among their predictive relationships and then zoom into specific clusters of interest. It is designed with a focus-plus-context analysis strategy and automatically generates links to online curated annotations. The dashboard consists of a cluster-level network, a bar plot of taxonomic composition, a line plot of data modalities, and a table for each pathway, as illustrated in Fig \ref{fig:dashboard}. Further, the package includes functions that handle the data processing for creating the dashboard. This makes it beginner-friendly for users with less R programming experience. We illustrate these methods with a case study on a longitudinal, multi-platform metagenomics analysis for cheese communities. # Statement of need -The realm of microbiomics is expanding rapidly, with numerous new studies and methodologies emerging [@BOKULICH20204048]. This highlights the need for visual exploration tools that can account for interaction across biological modalities [@6094057]. It’s also important to enable interpretations of dynamics and network structure because these have specific meanings in the genomic context [@corel:hal-01300043]. Another issue is the annotation. 
The special modality characteristic of microbiomics determines that each identical feature can be classified with various taxons and could have several IDs in different databases [@https://doi.org/10.1002/pro.3711]. Although the annotation is available online, it can be tedious to search for parts manually. Moreover, most present visualizations poorly evaluate longitudinal change across microbiomes. In longitudinal data, we need to gain insight into the functioning of how individual features change and how they may influence related features. Thus, it depends upon analysis within one table and across tables. All of these have posed a challenge for unified visualization and interpretation. +The realm of microbiomics is expanding rapidly, with numerous new studies and methodologies emerging [@BOKULICH20204048]. This highlights the need for visual exploration tools that can account for interaction across biological modalities [@6094057]. It’s also important to enable interpretations of dynamics and network structure because these have specific meanings in the genomic context [@corel:hal-01300043]. Another issue is the annotation of notable features. A characteristic of microbiome data is that each identical feature can be classified at several levels of taxonomic resolution and could have several IDs in different databases [@https://doi.org/10.1002/pro.3711]. Although relevant annotation is typically available online, it can be tedious to search through databases manually. Moreover, microbiome data often exhibit longitudinal variation. In this context, we must gain insight into the functioning of how individual features change and how they may influence related features. Thus, it depends upon analysis within one table and across tables. All of these have posed a challenge for unified visualization and interpretation. -In response to the above issues, previous studies on interactive visualization tools have designed methods to work on such data.
`microViz` [@microviz] provides a Shiny app for interactive exploration by pairing ordination plots and composition circular bar charts to show each taxon's prevalence and abundance. `GWENA` [@Lemoine_Scott-Boyer_Ambroise_Périn_Droit_2021] applies a network in conducting gene co‑expression analysis and extended module characterization in a single package to understand the underlying processes contributing to a disease or a phenotype. `NeVOmics` [@Zúñiga-León_Carrasco-Navarro_Fierro_2018] improved compatibility with a dynamic dashboard and facilitated the functional characterization of data from omics technologies. It also integrates Over-representation analysis methodology and network-based visualization to show the enrichment results. These methods suggest the mechanisms that improve the utility of microbiomics visualization tools under analysis. +In response to the above issues, previous studies on interactive visualization tools have designed methods to work on such data. `microViz` [@microviz] provides a Shiny app for interactive exploration by pairing ordination plots and composition circular bar charts to show each taxon's prevalence and abundance. `GWENA` [@Lemoine_Scott-Boyer_Ambroise_Périn_Droit_2021] applies a network in conducting gene co‑expression analysis and extended module characterization in a single package to understand the underlying processes contributing to a disease or a phenotype. `NeVOmics` [@Zúñiga-León_Carrasco-Navarro_Fierro_2018] improved compatibility with a dynamic dashboard and facilitated the functional characterization of data from omics technologies. It also integrates over-representation analysis and network-based visualization to display enrichment results. These methods suggest the mechanisms that improve the utility of microbiomics visualization tools under analysis. # Methods ## Network Generation -We first scale and cluster the trajectories across all molecular features to depict the longitudinal changes. 
For clustering, we use K-means and a built-in elbow method to choose the optimal number. Then, we predict a co-expression network for the extracted patterns, similar to what GENIE3 [@GENIE3] does to create a genetic regulatory network. We also divide the prediction process into individual regression tasks. Each central pattern of a cluster is predicted from the expression patterns of all the other central patterns, using tree-based ensemble methods Random Forests. It is chosen because of its potential to deal with interacting features and non-linearity without making any extra assumptions. The Mean Decrease Accuracy of a subset of top predictors whose expression directly influences the expression of the target cluster is taken as an indication of a putative link. That is to say, based on the random forest prediction, if two groups of features are highly linked according to the network, they will have strongly related longitudinal patterns, as shown in Fig \ref{fig:pattern}. +We first scale and cluster the trajectories across all molecular features to depict the longitudinal changes. For clustering, we use K-means and a built-in elbow method to choose the optimal number. Then, we predict a co-expression network for the extracted patterns, similar to how GENIE3 [@GENIE3] creates gene regulatory networks. We also divide the prediction process into individual regression tasks. Each central pattern of a cluster is predicted from the expression patterns of all the other central patterns, using random forests. This method is chosen because of its potential to model interacting features and non-linearity without strong assumptions. The Mean Decrease Accuracy of a subset of top predictors whose expression directly influences the expression of the target cluster is taken as an indication of a putative link.
That is to say, based on the random forest prediction, if two groups of features are highly linked according to the network, they will have strongly related longitudinal patterns, as shown in Fig \ref{fig:pattern}. ## Network Navigation -Navigating the network in the MolPad dashboard follows three steps, as shown in Fig \ref{fig:flow}: First, choose a primary functional annotation. Adjustment options for fine-tuning include network layout and importance threshold for edge density. Nodes that turn bright green (Fig \ref{fig:dashboard}.A) represent clusters containing the most features in the chosen functional annotation. Second, brushing on the network reveals patterns of taxonomic composition (Fig \ref{fig:dashboard}.B) and typical trajectories (Fig \ref{fig:dashboard}.C). The user can also zoom into specific taxonomic annotations by filtering. Third, view the feature table (Fig \ref{fig:dashboard}.D) , examine the drop-down options for other related function annotations, and click the link for online information on the interested items. The interface is designed to support iterative exploration, encouraging the use of several steps to answer specific questions, like comparing the pattern distribution between two functions or finding functionally important community members metabolizing a feature of interest. Overall, this aggregation adopted the focus-plus-context approach to address the low interoperability of the network graph, facilitating the examination of high-level details for individual features while providing contextual information about cluster interactions among microbiome data. +Navigating the network in the MolPad dashboard follows three steps, as shown in Fig \ref{fig:flow}: First, choose a primary functional annotation. Adjustment options for fine-tuning include network layout and importance threshold for edge density. Bright green nodes (Fig \ref{fig:dashboard}.A) represent clusters containing the most features in the chosen functional annotation.
Second, brushing on the network reveals patterns of taxonomic composition (Fig \ref{fig:dashboard}.B) and typical trajectories (Fig \ref{fig:dashboard}.C). The user can also zoom into specific taxonomic annotations by filtering. Third, they may view the feature table (Fig \ref{fig:dashboard}.D), examine the drop-down options for other related function annotations, and click the link for online details for the items of interest. The interface is designed to support iterative exploration, encouraging the use of several steps to answer specific questions, like comparing the distributional patterns between two functions or finding functionally important community members metabolizing a feature of interest. By applying a focus-plus-context approach, we can bridge the examination of high-level details related to individual features with contextual information about cluster interactions within the network visualization. ![Overview and workflow of using MolPad package. \label{fig:flow}](flow.png){ width=70% } + # Case Study: Cheese Data -Here we aim to highlight the versatility of the MolPad Dashboard with a case study of microbial communities on the wash-rind cheese' surface collected during cheese ripening [@doi:10.1128/msystems.00701-22]. This data stands for a general case that only includes single-omic measurements for the change of Bacteria or Eukaryota in each cheese sample. It has multiple nested annotation labels ranging from kingdom to class, making it more flexible in interpretation. +Here we aim to highlight the versatility of the MolPad Dashboard with a case study of microbial communities on the surface of washed-rind cheese collected during cheese ripening [@doi:10.1128/msystems.00701-22]. The dataset has multiple nested annotation labels ranging from kingdom to class, allowing flexible interpretation at multiple levels of resolution. -Our goal is to verify their conclusions and provide an alternative to visualize complicated longitudinal data.
According to the study, in the bacterial community, Firmicutes are dominant at the very beginning, and Proteobacteria quickly take over the domination by the end of ripening. Overall, cheeses A and C show a reproducible establishment of Actinobacteria and Bacteroidetes separately. To confirm the mentioned findings using the MolPad dashboard, we examined two cheeses (A and C) across all three batches during weeks 2 to 13. +Our goal is to verify conclusions from the original publication and provide an alternative visualization of the complex longitudinal measurements. According to the study, in the bacterial community, Firmicutes are dominant at initial timepoints and Proteobacteria quickly take over to dominate sample composition by the end of ripening. Further, Actinobacteria and Bacteroidetes were found to establish themselves in the final cheese A and C communities. To confirm these findings using the MolPad dashboard, we examined cheeses A and C across all three batches during weeks 2 to 13. -In applying the dashboard, we made an extended time series by connecting the last time point of cheese A with the first one of cheese C. This allowed us to track unusual pattern combinations among different species and stages. We take the top four groups from the bacterial community for detailed analysis in Fig \ref{fig:cheesecase}. Overall, our results match the above research and could be used to provide intuitive explanations in supporting the findings, which substantiate the capabilities of `MolPad` as a reproducible tool to streamline the visualization of longitudinal patterns. +In applying the dashboard, we concatenated the time series for cheeses A and C. This allowed us to track unusual pattern combinations among different species and stages. We take the top four groups from the bacterial community for detailed analysis in Fig \ref{fig:cheesecase}.
Overall, our results match the above research and could be used to provide intuitive explanations in supporting the findings, which substantiate the capabilities of `MolPad` as a reproducible tool to streamline the visualization of longitudinal patterns. # Usage -The source code for `MolPad` is stored on [Github](https://github.com/KaiyanM/MolPad). The app is hosted in the R package which can be downloaded and run on a local computer. We anticipate that some users may need more flexibility in their analyses, requiring backend R coding for tasks like setting up detailed operating models or downloading figure outputs. For such needs, the essential set of R functions employed in the Shiny app is accessible through the R package. +The source code for `MolPad` is available on [GitHub](https://github.com/KaiyanM/MolPad). The app is hosted in the R package, which can be downloaded and run locally. We anticipate that some users may need more flexibility in their analyses, requiring backend R development for tasks like setting up detailed operating models or downloading figure outputs. For such needs, the essential set of R functions employed in the Shiny app is accessible through the R package. # Figures diff --git a/test.R b/test.R deleted file mode 100644 index 1230833..0000000 --- a/test.R +++ /dev/null @@ -1,13 +0,0 @@ -setwd("/Users/hazelma/Documents/GitHub/MolPad") -usethis::use_testthat(3) - -data("test_data") -use_r() -use_test("pre_process.R") -use_test("gClusters.R") -use_test("gAnnotation.R") -use_test("gNetwork") -use_test("gDashboard.R") # removed -use_test("reshape_for_make_functions") -use_test("paste_URL") -devtools::test()