gwas scratch #2

TheRensselaerIDEA · Jul 15, 2020 · 21292cb · 21292cb
1 parent 160e50e
commit 21292cb
Show file tree

Hide file tree

Showing 3 changed files with 279 additions and 14 deletions.
diff --git a/GWAS/GWAS.R b/GWAS/GWAS.R
@@ -0,0 +1,62 @@
+knitr::opts_chunk$set(echo = TRUE)
+knitr::opts_knit$set(root.dir = "../")
+
+source("./Modules/Source.R")
+source("./GWAS/helper.R")
+
+data <- readRDS("./Preprocessing_FTS_Outputs/06-28-2020data.Rds")
+
+to_cluster <- subset(data, select = c(fips, Deaths))
+
+to_cluster <- drop_na(to_cluster)
+
+mort.cluster.raw <- cluster.counties(to_cluster, cluster.method="kmeans", cluster.num=4)
+
+mort.avg.cluster.raw <- data %>%
+                          dplyr::right_join(clustered, by = "fips") %>%
+                          dplyr::group_by(cluster) %>%
+                          dplyr::summarise(
+                            death_rate = sum(Deaths) / sum(population) * 10^5,
+                            count = n()
+                          ) %>% 
+                          dplyr::ungroup()
+
+
+mort.cluster.map <- mort.avg.cluster.raw %>% 
+                      dplyr::arrange(death_rate) %>% 
+                      dplyr::mutate(ord = as.character(1:n())) %>% 
+                      dplyr::select(-c(death_rate)) %>% 
+                      textshape::column_to_rownames("cluster")
+
+mort.cluster.ord <- order.county.clusters(mort.cluster.raw, mort.cluster.map)
+
+mort.avg.cluster.ord <- dplyr::mutate(mort.avg.cluster.raw, cluster = mort.cluster.map[cluster, "ord"])
+
+mort.rate <- data %>% 
+               dplyr::select(fips, death_rate)
+
+kendall.cor <- mort.rate %>% 
+                 dplyr::mutate(VAR = death_rate) %>%
+                 kendall.func(chr.data.2019) %>%
+                 dplyr::mutate(
+                   DIR = dplyr::if_else(
+                     kendall_cor <= 0,
+                     "Protective",
+                     "Destructive"
+                     )
+                   #chr_code = chr.namemap.2019[chr_code, 1]
+                 ) %>% na.omit() %>% 
+                 dplyr::arrange(desc(kendall_cor)) %>% 
+                 dplyr::mutate(chr_code = reorder(chr_code, kendall_cor))
+
+selected_SDs <- cbind(kendall.cor, state = state)
+
+#using Benjamin Hochberg's p.adjust() method to adjust P-values for multiple hypothesis testing to filter out factors that are less relevant.
+corrected_pvalues <- p.adjust(selected_SDs$kendall_p)
+selected_SDs <- cbind(selected_SDs, Corrected_P_Vals = corrected_pvalues)
+selected_SDs <- selected_SDs %>%
+  dplyr::filter(Corrected_P_Vals < 0.05) %>% 
+  dplyr::arrange(desc(kendall_cor))
+
+#Printing out the few selected factors for deaths of despair.
+print(selected_SDs)
diff --git a/GWAS.rmd → GWAS/GWAS.rmd b/GWAS.rmd → GWAS/GWAS.rmd
@@ -8,20 +8,6 @@ output: html_document
 ```{r setup, include=FALSE}
 knitr::opts_chunk$set(echo = TRUE)
 ```
-
-## Library
-
-```{r}
-# set.seed
-library("simEd")
-# kmeans
-library("stats")
-# diana
-library("cluster") 
-# cutree
-library("dendextend")
-```
-
 ## Principal Component Analysis
 
 ```{r}

diff --git a/GWAS/helper.R b/GWAS/helper.R
@@ -0,0 +1,217 @@
+knitr::opts_chunk$set(echo = TRUE)
+knitr::opts_knit$set(root.dir = "../")
+
+source("./Modules/Source.R")
+
+# Principal Component Analysis
+pca.func <- function(data.mat, row.name, seed = 20) {
+  set.seed(seed)
+  pca.result <- prcomp(data.mat, retx = TRUE, center = FALSE)
+  pca.result[["row.name"]] <- row.name
+  return(pca.result)
+}
+
+# km.func
+
+#' Kmeans clustering function
+#'
+#' @param data.mat data.frame in which one column is "county_fips" and other columns
+#'                  are data to cluster by
+#' @param cluster.num number of clusters to form 
+#' @param seed random seed to use, is variable so that results can be replicated
+#'
+#' @return a tribble with two columns:
+#'    county_fips: unique identifier for each county
+#'    cluster: number corresponding to cluster county belongs to
+#'
+#' @examples
+#' cdc.data %>%
+#'   cdc.mort.mat(input$state_choice, input$death_cause) %>%
+#'   km.func(4)
+#'   
+#' km.func(cdc.mort.mat(cdc.data, "NY", "Despair"), 5)
+#' 
+#' @author 
+#' @export
+km.func <- function(data.mat, cluster.num=3, seed=200) {
+
+  set.seed(seed)
+  cluster.num <- min(nrow(data.mat) - 1, cluster.num)
+  data.mat <- na.omit(data.mat)
+
+  km.result <- kmeans(dplyr::select(data.mat, -fips), cluster.num)
+  return(
+    tibble(
+      fips = dplyr::pull(data.mat, fips),
+      cluster = as.character(km.result$cluster)
+    )
+  )
+}
+
+
+# diana.func
+
+#' Diana clustering function
+#'
+#' @param data.mat data.frame in which one column is "county_fips" and other columns
+#'                  are data to cluster by
+#' @param cluster.num number of clusters to form 
+#' @param seed random seed to use, is variable so that results can be replicated
+#'
+#' @return a tribble with two columns:
+#'    county_fips: unique identifier for each county
+#'    cluster: number corresponding to cluster county belongs to
+#'    
+#' @examples
+#' cdc.data %>%
+#'   cdc.mort.mat(input$state_choice, input$death_cause) %>%
+#'   diana.func(4)
+#'   
+#'   
+#' diana.func(cdc.mort.mat(cdc.data, "NY", "Despair"), 5)
+#'    
+#' @author Ross DeVito
+#' @export
+diana.func <- function(data.mat, cluster.num=3, seed=200) {
+  set.seed(seed)
+  data.mat <- na.omit(data.mat)
+
+  diana.tree <- diana(dplyr::select(data.mat, -county_fips), 
+                      metric = "euclidean", 
+                      stand = FALSE,
+                      stop.at.k = FALSE,
+                      trace.lev = 0)
+  diana.clusters <- cutree(as.hclust(diana.tree), k = cluster.num)
+  return(tibble(county.fips = data.mat$county_fips, cluster = diana.clusters))
+}
+
+
+# cluster.counties
+
+#' Clusters counties using some specified clustering method
+#' 
+#' @note implemented in this way so that if/when additional clustering method
+#'        is added, it can be added to function in an else if
+#'
+#' @param county.data data.frame in which one column is "county_fips" and other columns
+#'                     are data to cluster by
+#' @param cluster.method method to be used to cluster counties. One of set {"kmeans", "diana"}
+#' @param cluster.num optional argument used by clustering methods where you must provide
+#'                      desired number of clusters (e.g. kmeans)
+#' @param seed random seed to be used by clustering algorithm
+#'
+#' @return a tribble with (num counties in state) rows and two columns:
+#'    county_fips: unique identifier for each county
+#'    cluster: number corresponding to cluster county belongs to
+#'    
+#'    e.g.
+#'    # A tibble: 67 x 2
+#'    county_fips cluster
+#'    <chr>       <chr>  
+#'  1 01001       1      
+#'  2 01003       4      
+#'  3 01005       2      
+#'  4 01007       4      
+#'  5 01009       4      
+#'  6 01011       2      
+#'  7 01013       2      
+#'  8 01015       1      
+#'  9 01017       1      
+#'  10 01019       4      
+#'  # … with 57 more rows
+#'
+#' @examples
+#' state <- "AL"
+#' death.cause <- "Despair"
+#' cluster.counties(cdc.mort.mat(cdc.data, state, death.cause),
+#'                  cluster.method="kmeans",
+#'                  cluster.num=4)
+#' 
+#' 
+#' cluster.counties(cdc.mort.mat(cdc.data, input$state_choice, input$death_cause),
+#'                  cluster.method="diana",
+#'                  cluster.num=4)
+#' 
+#' @author Ross DeVito
+#' @export
+cluster.counties <- function(county.data, cluster.method="kmeans", cluster.num=4, seed=200) {
+  if (cluster.method == "kmeans"){
+    return(km.func(county.data, cluster.num, seed))
+  } else if (cluster.method == "diana") {
+    return(diana.func(county.data, cluster.num, seed))
+  }
+}
+
+#cdc.mort.mat
+
+#' Used in app.R and/or elsewhere
+#' 
+#' @note Importing and Conversion
+#'
+#' @param cdc.data.long
+#' @param state.abbr
+#' @param death.cause
+#'
+#' @return 
+#'
+#' @examples
+#' 
+#' @author John Erickson (based on others)
+#' @export
+
+cdc.mort.mat <- function(cdc.data.long, state.abbr, death.cause = "Despair") {
+
+  tmp <- cdc.data.long
+  if (state.abbr != "US") {
+    tmp <- dplyr::filter(cdc.data.long, state_abbr == state.abbr)
+  }else {
+    tmp <- cdc.data.long
+  }
+
+  dplyr::filter(tmp, death_cause == death.cause) %>%
+    tidyr::drop_na(county_fips) %>%
+    dplyr::select(county_fips, death_rate, period) %>%
+    tidyr::spread(key = period, value = death_rate) %>%
+    dplyr::arrange(county_fips)
+}
+
+
+# order.county.clusters
+
+#' Given a dataframe of counties and their original cluster and data.frame that can
+#'  be used to map from original cluster to ordered cluster, returns a data.frame that
+#'  gives counties with their ordered cluster
+#'
+#' @param county.clusters a data.frame or tribble in the form returned by cluster.counties()
+#' @param cluster.order.map a data.frame that has the original cluster names as the row names
+#'                           and the ordered clusters in a column "ord". Data.frames returned
+#'                           by get.cluster.order.map are of this form
+#'
+#' @return a tribble with (num counties in state) rows and two columns:
+#'    county_fips: unique identifier for each county
+#'    cluster: number corresponding to cluster county belongs to, ordered such that 1 has the
+#'              lowest deathrate the highest cluster number (the number of clusters) the highest
+#'              
+#'    Same format at cluster.counties(). See for example
+#'
+#' @examples
+#' state <- "AL"
+#' death.cause <- "Despair"
+#' county.clusters <- cluster.counties(cdc.mort.mat(cdc.data, state, death.cause),
+#'                                      cluster.method="kmeans",
+#'                                      cluster.num=4)
+#' cluster.deathrates <- get.cluster.deathrate.during.time(county.clusters, cdc.data, death.cause)
+#' cluster.order.map <- get.cluster.order.map(cluster.deathrates)
+#' county.clusters.ordered <- order.county.clusters(county.clusters, cluster.order.map)
+#' 
+#' 
+#' order.county.clusters(mort.cluster.raw(), mort.cluster.map())
+#' 
+#' @author Ross DeVito
+#' @export
+#' order.county.clusters(mort.cluster.raw(), mort.cluster.map())
+order.county.clusters <- function(county.clusters, cluster.order.map) {
+  return(
+    dplyr::mutate(county.clusters, cluster = cluster.order.map[cluster, "ord"])
+  )
+}