Analysis.Rmd

---
title: "Analysis: Microhabitats are associated with diversity-productivity relationships in freshwater bacterial communities" 
author: "Marian L. Schmidt, marschmi@umich.edu, @micro_marian"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  html_document:
    code_folding: show
    highlight: default
    keep_md: yes
    theme: journal
    toc: yes
    toc_float:
      collapsed: no
      smooth_scroll: yes
      toc_depth: 3
editor_options: 
  chunk_output_type: console
---
<style>
pre code, pre, code {
  white-space: pre !important;
  overflow-x: scroll !important;
  word-break: keep-all !important;
  word-wrap: initial !important;
}
</style>


#### Purpose of this file  

This file contains all of the code for the main analysis needed to reproduce all figures except Figure S9 in the Schmidt et al. manuscript entitled **"Microhabitats shape diversity-productivity relationships in freshwater bacterial communities"**, published in [FEMS Microbiology Ecology](https://doi.org/10.1093/femsec/fiaa029). 

*Note*: To see how to reproduce Figure S9, please go to [analysis/OTU_Removal_Analysis.html](https://deneflab.github.io/Diversity_Productivity/analysis/OTU_Removal_Analysis.html) on the GitHub page for this project.

##### If you have any questions, please feel free to email the corresponding author at marschmi@umich.edu or tweet her at [micro_marian](https://twitter.com/micro_marian?lang=en).

```{r setup, include=FALSE}
# Wide console output; long lines scroll horizontally in the rendered code blocks
options(width = 250)

# Defaults applied to every chunk unless a chunk overrides them
knitr::opts_chunk$set(
  # Code execution and display
  engine = "R",        # chunks contain R code unless noted otherwise
  eval = TRUE,
  echo = TRUE,
  include = TRUE,
  collapse = FALSE,
  error = TRUE,
  # Diagnostics kept out of the rendered document
  warning = FALSE,
  message = FALSE,
  # Figure output (a TIFF device was previously tried: dev = "tiff")
  dpi = 600,
  fig.path = "figures/",
  fig.align = "center"
)

```

# Load Libraries
```{r load-libraries}
library(devtools)   # Reproducibility (see end of file)
library(phyloseq)   # Easier data manipulation  
library(picante)    # Phylogenetic Tree Analysis   
library(tidyverse)  # Pretty plotting and data manipulation 
library(forcats)    # Recoding factors
library(cowplot)    # Multiple plotting
library(picante)    # Will also include ape and vegan  
library(car)        # Residual analysis 
library(sandwich)   # vcovHC function in post-hoc test 
library(MASS)       # studres in plot_residuals function    
library(caret)      # Cross validation
library(pander)     # Pretty tables      
library(glmnet)     # Lasso regressions 
library(broom)      # Stats from lm regression 
library(purrr)      # Exporting lm results  
library(DT)         # Fancy HTML table output
library(lme4)       # linear mixed effects model
library(iNEXT)      # Calculate hill diversity with interpolation & extrapolation
library(hillR)      # Calculate phylogenetic hill numbers
library(hilldiv)    # Calculate hill phylogenetic hill numbers (for comparison)
library(phytools)   # for force.ultrametric() 
source("code/Muskegon_functions.R")   # Source custom functions  
source("code/set_colors.R")           # Set Colors for plotting

# Set the ggplot theme 
theme_set(theme_cowplot())
```


# Load data
```{r load-data}
# Load the saved workspace file; it supplies the phyloseq object printed below
load("data/surface_PAFL_otu_pruned_raw.RData")
# Print a summary of the loaded phyloseq object
surface_PAFL_otu_pruned_raw

# Keep only OTUs whose summed count across all samples is greater than 2
surface_PAFL_otu_pruned_rm2 <- prune_taxa(
  taxa_sums(surface_PAFL_otu_pruned_raw) > 2,
  surface_PAFL_otu_pruned_raw
)
surface_PAFL_otu_pruned_rm2

# Report how many OTUs are left for the analysis
paste("There are", ntaxa(surface_PAFL_otu_pruned_rm2), "OTUs remaining for analysis in the dataset.")

# Rebuild the phyloseq object without the tree (computational efficiency)
surface_PAFL_otu_pruned_notree_rm2 <- phyloseq(
  tax_table(surface_PAFL_otu_pruned_rm2),
  otu_table(surface_PAFL_otu_pruned_rm2),
  sample_data(surface_PAFL_otu_pruned_rm2)
)

# Sample metadata as a plain data frame, with fraction and lakesite turned into
# ordered-level factors carrying readable labels
metadata <- surface_PAFL_otu_pruned_notree_rm2 %>%
  sample_data() %>%
  data.frame() %>%
  mutate(
    fraction = fct_recode(
      factor(fraction, levels = c("WholePart", "WholeFree")),
      Particle = "WholePart", Free = "WholeFree"
    ),
    lakesite = fct_recode(
      factor(lakesite, levels = c("MOT", "MDP", "MBR", "MIN")),
      Outlet = "MOT", Deep = "MDP", Bear = "MBR", River = "MIN"
    )
  )
# Row names must be the sample names for the assignment into the phyloseq object
row.names(metadata) <- metadata$norep_filter_name

# Put the relabelled metadata back into the phyloseq object
sample_data(surface_PAFL_otu_pruned_notree_rm2) <- metadata

# Numeric matrix of environmental variables: free-living rows only,
# sample names as row names, short variable names as column names
environmental_data <- metadata %>%
  dplyr::select(
    Temp_C:DO_percent,
    -BGA_cellspermL, -SRP_ugperL,
    fraction, norep_filter_name,
    -DO_percent
  ) %>%
  dplyr::filter(fraction == "Free") %>%
  dplyr::select(-fraction) %>%
  tibble::column_to_rownames(var = "norep_filter_name") %>%
  dplyr::rename(
    Temp = Temp_C,
    SPC = SpCond_uSpercm,
    TDS = TDS_mgperL,
    ORP = ORP_mV,
    Chla = Chl_Lab_ugperL,
    Cl = Cl_mgperL,
    SO4 = SO4_mgperL,
    NO3 = NO3_mgperL,
    NH3 = NH3_mgperL,
    TKN = TKN_mgperL,
    TP = TP_ugperL,
    Alk = Alk_mgperL,
    DO = DO_mgperL
  ) %>%
  as.matrix()
```

```{r pca-environ}
#### PCA DATA
# Scale the data so every variable has the same variance
scaled_enviro <- scale(environmental_data)
# Sanity checks: column means should be ~0 and standard deviations 1
apply(scaled_enviro, 2, mean)
apply(scaled_enviro, 2, sd)
# rda() is called with the response matrix only (no constraining variables),
# so the ordination of the scaled environmental matrix is unconstrained
pca_environ <- rda(scaled_enviro)
# What is the proportion explained by each of the PCA axes?
summary(pca_environ)$cont$importance
# Site scores on the first two axes, one row per sample.
# The scores are taken with the scores() accessor rather than from the
# internals of the summary object, and only PC1/PC2 are requested, so no
# fixed set of later axes has to exist and be dropped by name.
pca_scores_df <- vegan::scores(pca_environ, display = "sites", choices = 1:2) %>%
  as.data.frame() %>%
  tibble::rownames_to_column(var = "norep_filter_name") %>%
  # Join key: characters 1-4 and 6-8 of the filter name (character 5 is skipped)
  mutate(norep_water_name = paste0(substr(norep_filter_name, 1, 4), substr(norep_filter_name, 6, 8))) %>%
  dplyr::select(-norep_filter_name)
# Combine the PCA scores with the rest of the metadata via the same key
metadata_pca <- metadata %>%
  mutate(norep_water_name = paste0(substr(norep_filter_name, 1, 4), substr(norep_filter_name, 6, 8))) %>%
  left_join(pca_scores_df, by = "norep_water_name")
```


# Calculate Diversity

## Hill Diversity
**Hill diversity with the iNEXT package**
```{r iNEXT-data}
# iNEXT input: the transposed OTU table stored as a data frame
iNEXT_input_table <- data.frame(t(otu_table(surface_PAFL_otu_pruned_rm2)))
# iNEXT takes a long time to run, so the result was computed once with the
# commented commands below and saved; this chunk only loads the saved object.
#set.seed(777)
#iNEXT_data <- iNEXT(iNEXT_input_table, q = c(0,1,2), datatype = "abundance")
#save(list="iNEXT_data", file=paste0("data/iNEXT/iNEXT_data_surface_PAFL_otu_pruned_rm2")) 
# Load the previously saved iNEXT_data object
load("data/iNEXT/iNEXT_data_surface_PAFL_otu_pruned_rm2")
#str(iNEXT_data)
# Take the AsyEst table and rename Site to the sample key used in this file
div_iNEXT <- rename(iNEXT_data$AsyEst, norep_filter_name = Site)
```


## Phylo Hill Diversity
**Phylogenetic Hill diversity with the hillR package**
```{r calc-hillR}
# Phylogenetic diversity is computed with hillR, which extends Hill numbers
# to phylogenetic diversity. The computation is slow, so the commands that
# produced the saved CSV are kept below (commented out) for provenance.
#otu_mat <- data.frame(otu_table(surface_PAFL_otu_pruned_rm2))
#phy_tree <- phy_tree(surface_PAFL_otu_pruned_rm2)
set.seed(111)
# Hill phylogenetic diversity for q = 0, 1, 2.
# Each call returns a named vector: sample name -> diversity value.
#hill_phy0_df <- hill_phylo(comm = otu_mat, tree = phy_tree, q = 0)
#hill_phy1_df <- hill_phylo(comm = otu_mat, tree = phy_tree, q = 1)
#hill_phy2_df <- hill_phylo(comm = otu_mat, tree = phy_tree, q = 2)
# Sample names (checked to be identical across the three vectors)
#norep_filter_name <- names(hill_phy0_df) 
#stopifnot(names(hill_phy0_df) == names(hill_phy1_df))
#stopifnot(names(hill_phy1_df) == names(hill_phy2_df))
# Stack the three vectors into long format
#div_sample_names <- rep(norep_filter_name, times = 3)
#phylo_div_vals <- c(unname(hill_phy0_df), unname(hill_phy1_df), unname(hill_phy2_df)) # rich, shannon, simpson
#phylo_div_types <- c(rep("Species richness", times = 24), rep("Shannon diversity", times = 24), rep("Simpson diversity", times = 24))
# Assemble the long data frame
#hillR_output <- data.frame(div_sample_names, phylo_div_vals, phylo_div_types) %>%
#  rename(norep_filter_name = div_sample_names, Diversity = phylo_div_types,
#         hillR_phylo = phylo_div_vals) 
# Save to file because hillR is computationally intensive
#write.csv(x = hillR_output, file = "data/iNEXT/hillR_output.csv", row.names = FALSE, quote = FALSE)

# Read the previously computed values (written by the commented code above)
hillR_output <- read.csv("data/iNEXT/hillR_output.csv")
# Relabel the diversity types with a phylo_ prefix, then widen to one column
# per diversity type
hillR_output <- hillR_output %>%
  mutate(
    Diversity = fct_recode(
      Diversity,
      "phylo_richness" = "Species richness",
      "phylo_shannon" = "Shannon diversity",
      "phylo_simpson" = "Simpson diversity"
    )
  ) %>%
  spread(key = Diversity, value = hillR_phylo)
```

**Put together the diversity dataframes into one**
```{r combine-diversity-dfs-prep}
# COMBINE iNEXT AND hillR DATA into one wide table keyed by sample name
# NOTE(review): the earlier comment expected 144 rows here; after spread() the
# table has one row per norep_filter_name — confirm the expected count.
div_df <- 
  div_iNEXT %>%
  dplyr::select(norep_filter_name, Diversity, Observed) %>%
  mutate(Diversity = fct_recode(Diversity, "richness" = "Species richness", 
                                "shannon" = "Shannon diversity", 
                                "simpson" = "Simpson diversity")) %>%
  # For this analysis, we will be using the observed values calculated from iNEXT
  spread(key = Diversity, value = Observed) %>%
  # Name the join key explicitly so the join can never silently pick up
  # another column that happens to share a name in both tables
  left_join(hillR_output, by = "norep_filter_name")
```

## Mean Pairwise Distance
```{r calc-sesmpd, fig.width = 4, fig.height = 4}  
# Read in the tree 
#RAREFIED_rm2_fasttree <- read.tree(file = "data/PhyloTree/newick_tree_rm2_rmN.tre")
  
# Load in data that has doubletons removed 
#load("data/PhyloTree/surface_PAFL_otu_pruned_RAREFIED_rm2.RData")
#surface_PAFL_otu_pruned_RAREFIED_rm2
# Create the OTU table for picante 
#surface_PAFL_RAREFIED_rm2_otu <- matrix(otu_table(surface_PAFL_otu_pruned_RAREFIED_rm2), nrow = nrow(otu_table(surface_PAFL_otu_pruned_RAREFIED_rm2)))
#rownames(surface_PAFL_RAREFIED_rm2_otu) <- sample_names(surface_PAFL_otu_pruned_RAREFIED_rm2)
#colnames(surface_PAFL_RAREFIED_rm2_otu) <- taxa_names(surface_PAFL_otu_pruned_RAREFIED_rm2)
    
  ## Calculate input for SES_MPD  
# Convert the abundance data to standardized abundances with the vegan function `decostand` (NOTE: method = "total")
#otu_decostand_total <- decostand(surface_PAFL_RAREFIED_rm2_otu, method = "total")
# check total abundance in each sample
#apply(otu_decostand_total, 1, sum)
# check for mismatches/missing species between community data and phylo tree
#RAREFIED_rm2_matches <- match.phylo.comm(RAREFIED_rm2_fasttree, otu_decostand_total)
# the resulting object is a list with $phy and $comm elements.  replace our
# original data with the sorted/matched data
#phy_RAREFIED_rm2 <- RAREFIED_rm2_matches$phy
#comm_RAREFIED_rm2 <- RAREFIED_rm2_matches$comm
# Calculate the phylogenetic distances
#phy_dist_RAREFIED_rm2 <- cophenetic(phy_RAREFIED_rm2)
## Calculate SES_MPD
###################################### INDEPENDENT SWAP ############################################
# calculate standardized effect size mean pairwise distance (ses.mpd)
#unweighted_sesMPD_indepswap_RAREFIED_rm2 <- ses.mpd(comm_RAREFIED_rm2, phy_dist_RAREFIED_rm2, null.model = "independentswap", 
#                                     abundance.weighted = FALSE, runs = 999)
#unweight_sesmpd <- unweighted_sesMPD_indepswap_RAREFIED_rm2 %>% rownames_to_column(var = "norep_filter_name")
#WEIGHTED_sesMPD_indepswap_RAREFIED_rm2 <- ses.mpd(comm_RAREFIED_rm2, phy_dist_RAREFIED_rm2, null.model = "independentswap", 
#                                     abundance.weighted = TRUE, runs = 999)
#weight_sesmpd <- WEIGHTED_sesMPD_indepswap_RAREFIED_rm2 %>% rownames_to_column(var = "norep_filter_name")
# Combine them together into a dataframe
#sesmpd_df <- unweight_sesmpd %>%
#  dplyr::select(norep_filter_name, mpd.obs.z) %>%
#  rename(unweighted_sesmpd = mpd.obs.z) %>%
#  left_join(dplyr::select(weight_sesmpd, norep_filter_name, mpd.obs.z), by = "norep_filter_name") %>%
#  rename(weighted_sesmpd = mpd.obs.z)
# Gather ALL div data!
#div_df <- div_df %>%
#  left_join(sesmpd_df, by = "norep_filter_name")
#write.csv(div_df, file = "data/iNEXT/diversity_measures.csv", quote = FALSE, row.names = FALSE)
```

#### Final dataframe
```{r combine-diversity-dfs-final}
# Read the saved table of diversity measures
div_df <- read.csv(file = "data/iNEXT/diversity_measures.csv")

# Wide format (used for plotting): diversity measures joined to the metadata
# and PCA scores by sample name
wide_div_meta_df <- left_join(div_df, metadata_pca, by = "norep_filter_name")

# Long format: the same joined table with the columns richness:phylo_richness
# stacked into Diversity (name) / Observed (value)
comb_div_df <- wide_div_meta_df %>%
  gather(key = "Diversity", value = "Observed", richness:phylo_richness) %>%
  data.frame()
head(comb_div_df)
dim(comb_div_df)

# iNEXT estimates with each sample's fraction and season attached
div_df_frac <- right_join(
  dplyr::select(metadata, norep_filter_name, fraction, season),
  div_iNEXT,
  by = "norep_filter_name"
)

### BASIC STATS OF PRODUCTION AND # OF CELLS
# Standard error of the mean: sd(x) / sqrt(n).
#   x     : numeric vector.
#   na.rm : if TRUE, missing values are dropped first, so n counts only the
#           non-missing values. The default (FALSE) keeps the previous
#           behaviour, where any NA in x makes the result NA.
# Returns a single numeric value.
std <- function(x, na.rm = FALSE) {
  if (na.rm) {
    x <- x[!is.na(x)]
  }
  sd(x) / sqrt(length(x))
}

# Numbers quoted at the beginning of the results section:
# per-fraction mean and standard error of cell abundance and community production
wide_div_meta_df %>%
  group_by(fraction) %>%
  summarize(
    avg_cells_per_mL = mean(fraction_bac_abund),
    se_cells_per_mL = std(fraction_bac_abund),
    avg_community_prod = mean(frac_bacprod),
    se_community_prod = std(frac_bacprod)
  )

# Per-fraction mean and standard error of log10 per-capita production,
# with sample MOTEJ515 left out
wide_div_meta_df %>%
  dplyr::filter(norep_filter_name != "MOTEJ515") %>%
  mutate(log10_percap_prod = log10(fracprod_per_cell)) %>%
  group_by(fraction) %>%
  summarize(
    avg_log10_percap_prod = mean(log10_percap_prod),
    se_log10_percap_prod = std(log10_percap_prod)
  )
```


# Main Figures
**Have a legend for all plots**  
Since the plotting parameters are set in the `set_colors.R` file that was sourced at the beginning, we can use them throughout to ensure that the legends below match the legends of the plots that follow. 
```{r legends}
####### Season legend, single row
# A throwaway plot is drawn only so that its legend can be extracted
legend_plot <- ggplot(metadata, aes(y = frac_bacprod, x = fraction, fill = fraction, shape = season)) +
  geom_jitter(size = 3, width = 0.2) +
  scale_fill_manual(
    values = fraction_colors,
    guide = guide_legend(override.aes = list(shape = 22, size = 4))
  ) +
  scale_shape_manual(
    values = season_shapes,
    guide = guide_legend(override.aes = list(size = 4))
  ) +
  theme(
    legend.position = "bottom",
    legend.justification = "center",
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    plot.margin = unit(c(0.2,0.2,1,0.2), "cm")  # top, right, bottom, left
  )
season_legend <- cowplot::get_legend(
  legend_plot +
    theme(legend.direction = "horizontal", legend.justification = "center", legend.box.just = "bottom")
)


####### Season legend stacked in two rows, for narrower figures
season_2row_plot <- ggplot(metadata, aes(y = frac_bacprod, x = fraction, fill = fraction, shape = season)) +
  geom_jitter(size = 3, width = 0.2) +
  scale_fill_manual(
    values = fraction_colors,
    guide = guide_legend(override.aes = list(shape = 22, size = 4))
  ) +
  scale_shape_manual(
    values = season_shapes,
    guide = guide_legend(override.aes = list(size = 4))
  ) +
  theme(
    legend.position = "bottom",
    legend.justification = "center",
    legend.box = "vertical",
    legend.spacing.y = unit(0.1, 'cm'),
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    plot.margin = unit(c(0.2,0.2,1,0.2), "cm")  # top, right, bottom, left
  )
season_2row_legend <- cowplot::get_legend(
  season_2row_plot +
    theme(legend.direction = "horizontal", legend.justification = "center", legend.box.just = "bottom")
)

####### Lakesite legend
# The helper plot is built inline, so lakesite_legend only ever holds the legend
lakesite_legend <- cowplot::get_legend(
  ggplot(metadata, aes(y = frac_bacprod, x = fraction, fill = fraction, shape = lakesite)) +
    geom_jitter(size = 3, width = 0.2) +
    scale_fill_manual(
      values = fraction_colors,
      guide = guide_legend(override.aes = list(shape = 22, size = 4))
    ) +
    scale_shape_manual(
      values = lakesite_shapes,
      guide = guide_legend(override.aes = list(size = 4))
    ) +
    theme(
      legend.position = "bottom",
      legend.justification = "center",
      legend.title = element_blank(),
      axis.title.x = element_blank(),
      plot.margin = unit(c(0.2,0.2,1,0.2), "cm")  # top, right, bottom, left
    ) +
    theme(legend.direction = "horizontal", legend.justification = "center", legend.box.just = "bottom")
)
```


## Figure 1
```{r Figure-1, fig.width = 8.5, fig.height = 3.5}
######################################################### FRACTION ABUNDANCE
# Wilcoxon rank-sum test of log10 cell abundance between the two fractions
# NOTE(review): the test uses every sample in `metadata`, whereas panel A below
# drops MOTEJ515 — confirm that this difference is intended.
frac_abund_wilcox <- wilcox.test(log10(as.numeric(fraction_bac_abund)) ~ fraction, data = metadata)
frac_abund_wilcox

# Mean cell abundance per fraction
metadata %>%
  group_by(fraction) %>%
  summarize(mean(as.numeric(fraction_bac_abund)))

# Corner coordinates of the significance bracket (positions chosen by eye)
dat1 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(6.45,6.5,6.5,6.45)) # Particle vs Free

# Panel A: log10 cell abundance by fraction (MOTEJ515 excluded)
# NOTE(review): the significance stars in all three panels are fixed text,
# while the p-value labels are computed — re-check the stars if the data change.
poster_a <- 
  ggplot(dplyr::filter(metadata, norep_filter_name != "MOTEJ515"), 
       aes(y = log10(as.numeric(fraction_bac_abund)), x = fraction)) +
  geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
  #ylab("Log10(Bacterial Counts) \n (cells/mL)") +
  ylab(expression(atop(log[10]*"(Bacterial Counts)", "(cells/mL)"))) + 
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  ##### Particle vs free cell abundances 
  geom_path(data = dat1, aes(x = a, y = b), linetype = 1, color = "gray40") +
  annotate("text", x=1.5, y=6.55, size = 8, color = "gray40", label= "***") +
  annotate("text", x=1.5, y=6.4, size = 3.5, color = "gray40",
           label= paste("p =", round(frac_abund_wilcox$p.value, digits = 6))) +
  theme(legend.position = "bottom",
        legend.title = element_blank(),
        axis.title.x = element_blank())

######################################################### TOTAL PRODUCTION 
# Wilcoxon rank-sum test of community production between the two fractions
totprod_wilcox <- wilcox.test(frac_bacprod ~ fraction, data = metadata)
totprod_wilcox

# Mean community production per fraction
metadata %>%
  group_by(fraction) %>%
  summarize(mean(frac_bacprod))

# Corner coordinates of the significance bracket (positions chosen by eye)
dat2 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(71,72,72,71)) # Particle vs Free

# Panel B: community production by fraction
# guide = "none" suppresses the fill legend (guide = FALSE is deprecated in ggplot2)
poster_b <- ggplot(metadata, aes(y = frac_bacprod, x = fraction)) + 
  geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
  scale_fill_manual(values = fraction_colors, guide = "none") +
  scale_shape_manual(values = season_shapes) +
  scale_y_continuous(limits = c(0, 78), expand = c(0,0), breaks = seq(0, 75, by = 15)) +
  #ylab("Community Production \n (μgC/L/day)") +
  ylab(expression(atop("Community Production", "(μgC/L/day)"))) + 
  ##### Particle vs free bulk production  
  geom_path(data = dat2, aes(x = a, y = b), linetype = 1, color = "gray40") +
  annotate("text", x=1.5, y=73, size = 8, color = "gray40", label= "*") +
  annotate("text", x=1.5, y=69, size = 3.5, color = "gray40",
           label= paste("p =", round(totprod_wilcox$p.value, digits = 3))) +
  theme(legend.position = "none",
        axis.title.x = element_blank())

######################################################### PER-CAPITA (PER-CELL) PRODUCTION 
# Wilcoxon rank-sum test of log10 per-cell production between the two
# fractions, with sample MOTEJ515 excluded
percellprod_wilcox <- wilcox.test(log10(fracprod_per_cell) ~ fraction, data = dplyr::filter(metadata, norep_filter_name != "MOTEJ515"))
percellprod_wilcox

# Mean per-cell production per fraction (MOTEJ515 excluded)
dplyr::filter(metadata, norep_filter_name != "MOTEJ515") %>%
  group_by(fraction) %>%
  summarize(mean(fracprod_per_cell))

# Corner coordinates of the significance bracket (positions chosen by eye)
dat3 <- data.frame(a = c(1.1,1.1,1.9,1.9), b = c(-5.05,-5,-5,-5.05)) # Particle vs Free

# NOTE(review): line_2c is not used anywhere in this chunk and looks like a
# leftover plotmath example; it is kept in case later code refers to it.
line_2c <- expression("log" ~~ sqrt(x, y) ~~ "or this" ~~ sum(x[i], i==1, n) ~~ "math expression")


# Panel C: log10 per-capita production by fraction (MOTEJ515 excluded)
poster_c <- ggplot(dplyr::filter(metadata, norep_filter_name != "MOTEJ515"), 
       aes(y = log10(fracprod_per_cell), x = fraction)) +
  geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes( fill = fraction)) +
  scale_fill_manual(values = fraction_colors, guide = "none") +
  scale_shape_manual(values = season_shapes) +
  ylim(c(-8.5, -4.9)) + 
  ylab(expression(atop(log[10]*"(Per-Capita Production)", "(μgC/cell/day)"))) + 
  #ylab("Log10(Per-Capita Production) \n(μgC/cell/day)") +
  ##### Particle vs free per-cell production 
  geom_path(data = dat3, aes(x = a, y = b), linetype = 1, color = "gray40") +
  annotate("text", x=1.5, y=-4.95, size = 8, color = "gray40", label= "***") +
  annotate("text", x=1.5, y=-5.15,  size = 3.5, color = "gray40",
           label= paste("p =", round(percellprod_wilcox$p.value, digits = 5))) +
  theme(legend.position = "none",
        axis.title.x = element_blank())

######## FIGURE 1
# Three panels in one row, with the shared season legend underneath
row1_plots <- plot_grid(poster_a + theme(legend.position = "none"), poster_b, poster_c,
          labels = c("A", "B", "C"),
          ncol = 3, nrow = 1)
fig_1 <- plot_grid(row1_plots, season_legend,
           ncol = 1, nrow = 2, 
           rel_heights = c(1, 0.08)); fig_1

# What are the averages? 
# Rows with a missing value in either production column are dropped first
metadata %>%
  group_by(fraction) %>%
  dplyr::select(frac_bacprod, fracprod_per_cell_noinf) %>%
  na.omit() %>%
  summarize(avg_comm_prod = mean(frac_bacprod),
            avg_percell_prod = mean(fracprod_per_cell_noinf),
            log10_avg_percell_prod = log10(avg_percell_prod))
```

## Figure 2
```{r Figure-2, fig.width=9, fig.height=3.2}
# Order in which the diversity metrics are shown, and the matching plotmath
# facet labels (same order, element by element)
site_div_order <- c("richness","shannon","simpson","unweighted_sesmpd",
                    "phylo_richness", "phylo_shannon","phylo_simpson", "weighted_sesmpd") 

site_div_labs <- c("{}^0*italic(D)", "{}^1*italic(D)","{}^2*italic(D)", "italic(Unweighted_MPD)",
                   "{}^0*italic(PD)", "{}^1*italic(PD)","{}^2*italic(PD)", "italic(Weighted_MPD)")

# Long table: one row per sample and diversity metric
long_div_meta_df <- 
  wide_div_meta_df %>%
  dplyr::select(c(norep_filter_name, richness:weighted_sesmpd, fraction, lakesite, season)) %>%
  gather(key = Diversity, value = div_val, richness:weighted_sesmpd) %>%
  # The level order is passed positionally: fct_relevel() has no `levels` argument
  mutate(Diversity = fct_relevel(Diversity, site_div_order),
         hill_labels = factor(Diversity, labels = site_div_labs))


# Wilcoxon rank-sum test (Particle vs Free) for each metric, in site_div_order
wc_pafl_pvals1 <-  c(wilcox.test(richness ~ fraction, data = wide_div_meta_df)$p.value,
                    wilcox.test(shannon ~ fraction, data = wide_div_meta_df)$p.value, 
                    wilcox.test(simpson ~ fraction, data = wide_div_meta_df)$p.value, 
                    wilcox.test(unweighted_sesmpd ~ fraction, data = wide_div_meta_df)$p.value,
                    # Row 2 
                    wilcox.test(phylo_richness ~ fraction, data = wide_div_meta_df)$p.value, 
                    wilcox.test(phylo_shannon ~ fraction, data = wide_div_meta_df)$p.value,
                    wilcox.test(phylo_simpson ~ fraction, data = wide_div_meta_df)$p.value, 
                    wilcox.test(weighted_sesmpd ~ fraction, data = wide_div_meta_df)$p.value)

# False-discovery-rate correction across the eight tests
pafl_pvals_adjusted <- p.adjust(wc_pafl_pvals1, method = "fdr")

# Text labels for the panels
# NOTE(review): entries 3 and 8 are fixed to "NS" rather than derived from the
# adjusted p-values — re-check them if the data change.
wc_pafl_pvals <-  c(paste0("p=", round(pafl_pvals_adjusted[1], digits = 3)),
                    paste0("p=", round(pafl_pvals_adjusted[2], digits = 3)),
                    "NS",
                    paste0("p=", round(pafl_pvals_adjusted[4], digits = 2)),
                    # Row 2 
                    paste0("p=", round(pafl_pvals_adjusted[5], digits = 3)),
                    paste0("p=", round(pafl_pvals_adjusted[6], digits = 3)),
                    paste0("p=", round(pafl_pvals_adjusted[7], digits = 3)),
                    "NS")

# x position of every label (the "Free" category)
# NOTE(review): this creates a global named `fraction`, the same name as the
# metadata column; it is kept because it becomes the `fraction` column of df_temp.
fraction <- as.character(rep("Free", 8))

# Maximum Diversity Values by Fraction 
wide_div_meta_df %>%
  group_by(fraction) %>%
  summarize(max_rich = max(richness), max_shan = max(shannon), max_simps = max(simpson), max_umpd = max(unweighted_sesmpd), 
            # row 2
            max_phyrich = max(phylo_richness), max_physhan = max(phylo_shannon), max_physimps = max(phylo_simpson), max_wmpd = max(weighted_sesmpd))

# Minimum Diversity Values by Fraction 
# Every column is named min_* so the printed table matches what it contains
wide_div_meta_df %>%
  group_by(fraction) %>%
  summarize(min_rich = min(richness), min_shan = min(shannon), min_simps = min(simpson), min_umpd = min(unweighted_sesmpd), 
            # row 2
            min_phyrich = min(phylo_richness), min_physhan = min(phylo_shannon), min_physimps = min(phylo_simpson), min_wmpd = min(weighted_sesmpd))

# Locations of labels: y position per metric, in site_div_order
maxes <- c(1350, 299, 78, 1.35, 150, 15.2, 5.8, 0.1)
df_temp <- data.frame(hill_labels = site_div_labs, wc_pafl_pvals, fraction, maxes) 

# Figure 2: the four metrics richness, shannon, simpson and unweighted_sesmpd
# by fraction, one facet per metric, each with its p-value label
fig2 <- long_div_meta_df %>% 
  dplyr::filter(Diversity %in% c("richness", "shannon", "simpson", "unweighted_sesmpd")) %>%
  ggplot(aes(y = div_val, x = fraction, fill = fraction)) +
  ylab("Value") +
  facet_wrap(hill_labels~., scales = "free", labeller = label_parsed, nrow = 2, ncol = 4) + 
  geom_point(size = 3, position = position_jitterdodge(), aes(shape = season)) +
  geom_boxplot(alpha = 0.5, outlier.shape = NA, color = "black") +
  scale_fill_manual(values = fraction_colors) + 
  scale_shape_manual(values = season_shapes) + 
  scale_color_manual(values = fraction_colors) + 
  theme(legend.position = "none", axis.title.x = element_blank()) +
  geom_text(data = dplyr::filter(df_temp, hill_labels %in% c("{}^0*italic(D)", "{}^1*italic(D)","{}^2*italic(D)", "italic(Unweighted_MPD)")), 
            aes(x = fraction, y = maxes, label = wc_pafl_pvals), size = 4.5, hjust = 0.5,color = "grey40") 

plot_grid(fig2, season_legend, nrow =2, ncol =1, rel_heights = c(1, 0.08))
```

### Linear Models 
**Prepare data frames for table S1 and S2**
```{r prep-Table-S1-S2}
## One wide table per fraction
metadata_pca_PD_free <- dplyr::filter(wide_div_meta_df, fraction == "Free")
metadata_pca_PD_part <- dplyr::filter(wide_div_meta_df, fraction == "Particle")

# Columns used in the single-predictor linear models: the two production
# responses followed by the candidate predictors.
# BGA_cellspermL and Turb_NTU do not have enough data points for regression
lm_variables <- c(
  "frac_bacprod", "fracprod_per_cell_noinf",
  "Temp_C", "SpCond_uSpercm", "TDS_mgperL", "pH", "ORP_mV", "Chl_Lab_ugperL",
  "Cl_mgperL", "SO4_mgperL", "NO3_mgperL", "NH3_mgperL", "TKN_mgperL",
  "SRP_ugperL", "TP_ugperL", "Alk_mgperL", "DO_mgperL",
  "total_bac_abund", "attached_bac", "dnaconcrep1",
  "PC1", "PC2",
  "richness", "shannon", "simpson",
  "phylo_richness", "phylo_shannon", "phylo_simpson",
  "unweighted_sesmpd", "weighted_sesmpd"
)

# Each of the four pipelines below follows the same steps:
#   1. keep the model columns and drop the response that is not being modelled
#   2. stack the predictors into long format (independent / measurement)
#   3. fit one lm per predictor and keep its one-row glance() summary
#   4. label the rows and FDR-adjust the p-values within that set of models

########################################################
##############  Particle: community-wide production
lm_summary_particle_comm <- metadata_pca_PD_part %>%
  dplyr::select(one_of(lm_variables), -fracprod_per_cell_noinf) %>%
  gather(key = independent, value = measurement, -frac_bacprod) %>%
  mutate(measurement = as.numeric(measurement)) %>%
  group_by(independent) %>%
  do(glance(lm(frac_bacprod ~ measurement, data = .))) %>%
  ungroup() %>%
  mutate(
    fraction = "Particle",
    dependent = "Community Production",
    FDR.p = p.adjust(p.value, method = "fdr")
  ) %>%
  dplyr::select(fraction, dependent, independent, logLik, AIC, adj.r.squared, p.value, FDR.p)

##############  Particle: log10 per-capita production
lm_summary_particle_percap <- metadata_pca_PD_part %>%
  dplyr::select(one_of(lm_variables), -frac_bacprod) %>%
  filter(!is.na(fracprod_per_cell_noinf)) %>%   # drop samples whose response is NA
  gather(key = independent, value = measurement, -fracprod_per_cell_noinf) %>%
  mutate(measurement = as.numeric(measurement)) %>%
  group_by(independent) %>%
  do(glance(lm(log10(fracprod_per_cell_noinf) ~ measurement, data = .))) %>%
  ungroup() %>%
  mutate(
    fraction = "Particle",
    dependent = "Per-Capita Production",
    FDR.p = p.adjust(p.value, method = "fdr")
  ) %>%
  dplyr::select(fraction, dependent, independent, logLik, AIC, adj.r.squared, p.value, FDR.p)


########################################################
##############  Free: community-wide production
lm_summary_free_comm <- metadata_pca_PD_free %>%
  dplyr::select(one_of(lm_variables), -fracprod_per_cell_noinf) %>%
  gather(key = independent, value = measurement, -frac_bacprod) %>%
  mutate(measurement = as.numeric(measurement)) %>%
  group_by(independent) %>%
  do(glance(lm(frac_bacprod ~ measurement, data = .))) %>%
  ungroup() %>%
  mutate(
    fraction = "Free",
    dependent = "Community Production",
    FDR.p = p.adjust(p.value, method = "fdr")
  ) %>%
  dplyr::select(fraction, dependent, independent, logLik, AIC, adj.r.squared, p.value, FDR.p)

##############  Free: log10 per-capita production
lm_summary_free_percap <- metadata_pca_PD_free %>%
  dplyr::select(one_of(lm_variables), -frac_bacprod) %>%
  filter(!is.na(fracprod_per_cell_noinf)) %>%   # drop samples whose response is NA
  gather(key = independent, value = measurement, -fracprod_per_cell_noinf) %>%
  mutate(measurement = as.numeric(measurement)) %>%
  group_by(independent) %>%
  do(glance(lm(log10(fracprod_per_cell_noinf) ~ measurement, data = .))) %>%
  ungroup() %>%
  mutate(
    fraction = "Free",
    dependent = "Per-Capita Production",
    FDR.p = p.adjust(p.value, method = "fdr")
  ) %>%
  dplyr::select(fraction, dependent, independent, logLik, AIC, adj.r.squared, p.value, FDR.p)


## Stack the four result tables and give the variable columns explicit names
all_pca_div_environ_lm_results <-
  bind_rows(
    lm_summary_particle_comm, lm_summary_particle_percap,
    lm_summary_free_comm, lm_summary_free_percap
  ) %>%
  dplyr::rename(
    independent_var = independent,
    dependent_var = dependent
  )

## Rounded copy of the combined results for reporting
individual_lm_results <- all_pca_div_environ_lm_results %>%
  mutate(
    AIC = round(AIC, digits = 2),
    adj.r.squared = round(adj.r.squared, digits = 2),
    p.value = round(p.value, digits = 4),
    FDR.p = round(FDR.p, digits = 4)
  ) %>%
  rename(FDR.p.value = FDR.p)
```

```{r linear-models}
# Free-Living samples only 
div_measures_free <- comb_div_df %>%
  dplyr::select(fraction, Observed, Diversity, frac_bacprod, fracprod_per_cell_noinf) %>%
  filter(fraction == "Free") %>%
  dplyr::select(-fraction)
# Particle-associated samples only 
div_measures_part <- comb_div_df %>%
  dplyr::select(fraction, Observed, Diversity, frac_bacprod, fracprod_per_cell_noinf) %>%
  filter(fraction == "Particle") %>%
  dplyr::select(-fraction)

## All of the samples!
div_measures_all <- comb_div_df %>%
  dplyr::select(Observed, Diversity, frac_bacprod, fracprod_per_cell_noinf)

########################################################
# For each fraction (Particle / Free) and each response (community-wide
# production / log10 per-capita production) the pipelines below:
#   1. drop the response column that is not being modelled,
#   2. fit one ordinary least squares model per diversity metric
#      (group_by(Diversity)) with the metric value (Observed) as predictor,
#   3. keep the one-row glance() summary of each fit,
#   4. label the rows with the fraction and response, and
#   5. FDR-adjust the p-values across the metrics within that one table.
##############  Particle: Community-wide 
lm_divs_particle_comm <- div_measures_part %>%
  dplyr::select(-fracprod_per_cell_noinf) %>%
  group_by(Diversity) %>% 
  do(glance(lm(frac_bacprod ~ Observed, data = .))) %>% 
  mutate(fraction = "Particle", dependent = "Community Production") %>%
  dplyr::select(fraction, dependent, Diversity, logLik, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%   # ungroup first so p.adjust() sees all metrics at once
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 


##############  Particle: Per-capita (log10-transformed response)
lm_divs_particle_percap <- div_measures_part %>%
  dplyr::select(-frac_bacprod) %>%
  group_by(Diversity) %>% 
  do(glance(lm(log10(fracprod_per_cell_noinf) ~ Observed, data = .))) %>% 
  mutate(fraction = "Particle", dependent = "Per-Capita Production") %>%
  dplyr::select(fraction, dependent, Diversity, logLik, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%   # ungroup first so p.adjust() sees all metrics at once
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 

########################################################
##############  Free: Community-wide 
lm_divs_free_comm <- div_measures_free %>%
  dplyr::select(-fracprod_per_cell_noinf) %>%
  group_by(Diversity) %>% 
  do(glance(lm(frac_bacprod ~ Observed, data = .))) %>% 
  mutate(fraction = "Free", dependent = "Community Production") %>%
  dplyr::select(fraction, dependent, Diversity, logLik, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%   # ungroup first so p.adjust() sees all metrics at once
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 

##############  Free: Per-capita (log10-transformed response)
lm_divs_free_percap <- div_measures_free %>%
  dplyr::select(-frac_bacprod) %>%
  group_by(Diversity) %>% 
  do(glance(lm(log10(fracprod_per_cell_noinf) ~ Observed, data = .))) %>% 
  mutate(fraction = "Free", dependent = "Per-Capita Production") %>%
  dplyr::select(fraction, dependent, Diversity, logLik, AIC, adj.r.squared, p.value) %>%
  ungroup() %>%   # ungroup first so p.adjust() sees all metrics at once
  mutate(FDR.p = p.adjust(p.value, method = "fdr")) 

# Community-wide production: stack the particle and free-living results,
# drop the (constant) response label and round the statistics for display.
table_comm_wide <- lm_divs_particle_comm %>%
  bind_rows(lm_divs_free_comm) %>%
  dplyr::select(-dependent) %>%
  mutate(
    AIC = round(AIC, digits = 1),
    adj.r.squared = round(adj.r.squared, digits = 3),
    p.value = round(p.value, digits = 3),
    FDR.p = round(FDR.p, digits = 3)
  )
datatable(
  table_comm_wide,
  rownames = FALSE,
  options = list(pageLength = 12),
  caption = "Ordinary least squares regression statistics for community-wide heterotrophic production"
)


# log10(per-capita production): same layout as the table above.
table_percap <- lm_divs_particle_percap %>%
  bind_rows(lm_divs_free_percap) %>%
  dplyr::select(-dependent) %>%
  mutate(
    AIC = round(AIC, digits = 1),
    adj.r.squared = round(adj.r.squared, digits = 3),
    p.value = round(p.value, digits = 3),
    FDR.p = round(FDR.p, digits = 3)
  )
datatable(
  table_percap,
  rownames = FALSE,
  options = list(pageLength = 12),
  caption = "Ordinary least squares regression statistics for log10(per-capita production)"
)


# Fit every per-fraction linear model once and keep it as a named object so
# the plotting chunks below can reuse the fits.
# Naming: lm_[percell_]prod_vs_<metric>_<PA|FL>
#   PA = particle-associated fraction, FL = free-living fraction.

#########################################################
# COMMUNITY-WIDE PRODUCTION VS DIVERSITY
#########################################################

# Species richness
lm_prod_vs_rich_PA <- lm(
  frac_bacprod ~ richness,
  data = filter(wide_div_meta_df, fraction == "Particle")
)
lm_prod_vs_rich_FL <- lm(
  frac_bacprod ~ richness,
  data = filter(wide_div_meta_df, fraction == "Free")
)

# exp(H')
lm_prod_vs_shannon_PA <- lm(
  frac_bacprod ~ shannon,
  data = filter(wide_div_meta_df, fraction == "Particle")
)
lm_prod_vs_shannon_FL <- lm(
  frac_bacprod ~ shannon,
  data = filter(wide_div_meta_df, fraction == "Free")
)

# Inverse Simpson
lm_prod_vs_invsimps_PA <- lm(
  frac_bacprod ~ simpson,
  data = filter(wide_div_meta_df, fraction == "Particle")
)
lm_prod_vs_invsimps_FL <- lm(
  frac_bacprod ~ simpson,
  data = filter(wide_div_meta_df, fraction == "Free")
)

######################## PHYLOGENETIC HILL DIVERSITY METRICS ########################
# 0D: phylogenetic richness
lm_prod_vs_phylorich_PA <- lm(
  frac_bacprod ~ phylo_richness,
  data = filter(wide_div_meta_df, fraction == "Particle")
)
lm_prod_vs_phylorich_FL <- lm(
  frac_bacprod ~ phylo_richness,
  data = filter(wide_div_meta_df, fraction == "Free")
)

# 1D: phylogenetic exp(H')
lm_prod_vs_phyloshan_PA <- lm(
  frac_bacprod ~ phylo_shannon,
  data = filter(wide_div_meta_df, fraction == "Particle")
)
lm_prod_vs_phyloshan_FL <- lm(
  frac_bacprod ~ phylo_shannon,
  data = filter(wide_div_meta_df, fraction == "Free")
)

# 2D: phylogenetic inverse Simpson
lm_prod_vs_phylosimps_PA <- lm(
  frac_bacprod ~ phylo_simpson,
  data = filter(wide_div_meta_df, fraction == "Particle")
)
lm_prod_vs_phylosimps_FL <- lm(
  frac_bacprod ~ phylo_simpson,
  data = filter(wide_div_meta_df, fraction == "Free")
)

#########################################################
# PER-CAPITA PRODUCTION (log10) VS DIVERSITY
#########################################################

# Species richness
lm_percell_prod_vs_rich_PA <- lm(
  log10(fracprod_per_cell_noinf) ~ richness,
  data = filter(wide_div_meta_df, fraction == "Particle")
)
lm_percell_prod_vs_rich_FL <- lm(
  log10(fracprod_per_cell_noinf) ~ richness,
  data = filter(wide_div_meta_df, fraction == "Free")
)

# exp(H')
lm_percell_prod_vs_shannon_PA <- lm(
  log10(fracprod_per_cell_noinf) ~ shannon,
  data = filter(wide_div_meta_df, fraction == "Particle")
)
lm_percell_prod_vs_shannon_FL <- lm(
  log10(fracprod_per_cell_noinf) ~ shannon,
  data = filter(wide_div_meta_df, fraction == "Free")
)

# Inverse Simpson
lm_percell_prod_vs_invsimps_PA <- lm(
  log10(fracprod_per_cell_noinf) ~ simpson,
  data = filter(wide_div_meta_df, fraction == "Particle")
)
lm_percell_prod_vs_invsimps_FL <- lm(
  log10(fracprod_per_cell_noinf) ~ simpson,
  data = filter(wide_div_meta_df, fraction == "Free")
)

######################## PHYLOGENETIC HILL DIVERSITY METRICS ########################
# 0D: phylogenetic richness
lm_percell_prod_vs_phylorich_PA <- lm(
  log10(fracprod_per_cell_noinf) ~ phylo_richness,
  data = filter(wide_div_meta_df, fraction == "Particle")
)
lm_percell_prod_vs_phylorich_FL <- lm(
  log10(fracprod_per_cell_noinf) ~ phylo_richness,
  data = filter(wide_div_meta_df, fraction == "Free")
)

# 1D: phylogenetic exp(H')
lm_percell_prod_vs_phyloshan_PA <- lm(
  log10(fracprod_per_cell_noinf) ~ phylo_shannon,
  data = filter(wide_div_meta_df, fraction == "Particle")
)
lm_percell_prod_vs_phyloshan_FL <- lm(
  log10(fracprod_per_cell_noinf) ~ phylo_shannon,
  data = filter(wide_div_meta_df, fraction == "Free")
)

# 2D: phylogenetic inverse Simpson
lm_percell_prod_vs_phylosimps_PA <- lm(
  log10(fracprod_per_cell_noinf) ~ phylo_simpson,
  data = filter(wide_div_meta_df, fraction == "Particle")
)
lm_percell_prod_vs_phylosimps_FL <- lm(
  log10(fracprod_per_cell_noinf) ~ phylo_simpson,
  data = filter(wide_div_meta_df, fraction == "Free")
)
```


## Figure 3
**Prepare Figure 3**
```{r prep-Figure-3}
# FDR correction for Figure 3.
# Collect the slope p-value (2nd coefficient, 4th column of the coefficient
# table) of the four particle-associated models, in this order:
#   [1] richness vs community production   [2] richness vs per-capita production
#   [3] inv. Simpson vs community prod.    [4] inv. Simpson vs per-capita prod.
fig3_pvals <- c(
  coef(summary(lm_prod_vs_rich_PA))[, 4][2],
  coef(summary(lm_percell_prod_vs_rich_PA))[, 4][2],
  coef(summary(lm_prod_vs_invsimps_PA))[, 4][2],
  coef(summary(lm_percell_prod_vs_invsimps_PA))[, 4][2]
)
# Adjust the four p-values together
fig3_adjusted_pvals <- p.adjust(fig3_pvals, method = "fdr")

######################################################### OBSERVED RICHNESS ######################################################### 
# Wilcoxon test of richness between the two fractions; the result is printed
# and its p-value is reused in the plot annotation below.
rich_fraction_wilcox <- wilcox.test(richness ~ fraction, data = wide_div_meta_df)
rich_fraction_wilcox

# Per-fraction summary statistics of richness (printed to the report)
wide_div_meta_df %>%
  group_by(fraction) %>%
  summarize(mean(richness), sd(richness), min(richness), max(richness))

# Make a data frame to draw significance line in boxplot (visually calculated)
rich_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(1350, 1400, 1400, 1350)) # WholePart vs WholeFree

# Jittered points + boxplot of richness per fraction, flipped so that richness
# runs along the horizontal axis, with a hand-placed significance bracket.
rich_distribution_plot <- 
  wide_div_meta_df %>%
  ggplot(aes(y = richness, x = fraction)) +
  scale_fill_manual(values = fraction_colors) +
  geom_jitter(size = 3, aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(150,1500), breaks = seq(from = 0, to =1500, by = 300)) + 
  labs(y = expression({}^0*italic(D)),
       x = "Fraction") +
  scale_shape_manual(values = season_shapes) +
  # Bracket between the two fractions (coordinates from rich_nums)
  geom_path(data = rich_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
  # NOTE(review): the "***" label is hard-coded, not derived from the test p-value
  annotate("text", x=1.5, y=1500, size = 8, color = "#424645", label= paste("***"), angle = 90) +
  annotate("text", x=1.5, y=1150, size = 4, color = "#424645",
           label= paste("p =", round(rich_fraction_wilcox$p.value, digits = 3))) +
  theme(legend.position = "none",# axis.title.y = element_blank(),
        axis.title.x = element_blank(), axis.text.x = element_blank()) +
  coord_flip()

################ Richness vs Community-wide (Per-Liter) Production 
## 1. Extract the R2 and p-value from the linear model: 
# Builds a plotmath string (parsed by annotate(..., parse = TRUE)) showing the
# adjusted R2 of the particle model and its FDR-adjusted p-value (element 1 of
# fig3_adjusted_pvals).
lm_lab_perliter_rich_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_rich_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(fig3_adjusted_pvals[1]), digits = 3), ")")

## 2. Plot Richness vs Community-wide (Per-Liter) Production 
prod_vs_rich_plot <-  
  wide_div_meta_df %>%
  # Fetch the standard error from diversity measure to plot error bars 
  left_join(filter(div_iNEXT, Diversity == "Species richness"), by = "norep_filter_name") %>%
  rename(se_iNEXT ="s.e.") %>%
  ggplot(aes(x=richness, y=frac_bacprod)) + 
  geom_errorbarh(aes(xmin = richness - se_iNEXT, xmax = richness + se_iNEXT, 
                     color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, 
                    color = fraction)) +  # Y-axis errorbars
  geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  labs(x = expression({}^0*italic(D)),
       y = expression(atop("Community Production", "(μgC/L/day)"))) + 
  # Regression line + ribbon drawn for the particle-associated samples only
  geom_smooth(data=filter(wide_div_meta_df, fraction == "Particle"), 
              method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(150,1500), breaks = seq(from = 0, to =1500, by = 300)) + 
  scale_y_continuous(limits = c(-5, 75), breaks = seq(0, 75, by = 15)) +
   # Add the R2 and p-value to the plot 
  annotate("text", x=1200, y=50, label=lm_lab_perliter_rich_PA, parse = TRUE, color = "#FF6600", size = 4) +
  theme(legend.position = "none") # axis.title.x = element_blank(), axis.text.x = element_blank() 

# Summary stats 
summary(lm(frac_bacprod ~ richness, data = filter(wide_div_meta_df, fraction == "Particle")))
# WITHOUT THE TWO HIGH POINTS 
# (sensitivity check: refit using only particle samples with richness < 1000)
summary(lm(frac_bacprod ~ richness, data = filter(wide_div_meta_df, fraction == "Particle" & richness < 1000)))

################ Richness vs Per Capita (Per-Cell) Production 
## 1. Extract the R2 and p-value from the linear model: 
# Plotmath label with the adjusted R2 of the particle per-cell model and its
# FDR-adjusted p-value (element 2 of fig3_adjusted_pvals).
lm_lab_percell_rich_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_rich_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(fig3_adjusted_pvals[2]), digits = 3), ")")

## 2. Plot Richness vs Per Capita (Per-Cell) Production 
percell_prod_vs_rich_plot <- 
  wide_div_meta_df %>%
  # Fetch the standard error from diversity measure to plot error bars 
  left_join(filter(div_iNEXT, Diversity == "Species richness"), by = "norep_filter_name") %>%
  rename(se_iNEXT ="s.e.") %>%
  ggplot(aes(x=richness, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = richness - se_iNEXT, xmax = richness + se_iNEXT, 
                     color = fraction), alpha = 0.7) + # X-axis errorbars
  # No Y-axis error bars are drawn for the per-cell response in this plot
  geom_point(size = 3.5,  color = "black", aes(fill = fraction, shape = season)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  labs(x = expression({}^0*italic(D)),
       y = expression(atop(log[10]*"(Per-Capita Production)", "(μgC/cell/day)"))) + 
  # Regression line + ribbon drawn for the particle-associated samples only
  geom_smooth(data=filter(wide_div_meta_df, fraction == "Particle"), 
              method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(150,1500), breaks = seq(from = 0, to =1500, by = 300)) + 
  scale_y_continuous(limits = c(-8.5,-5), breaks = seq(from = -8, to =-5, by = 1)) + 
  #scale_y_continuous(limits = c(-8e-7,5e-6), breaks = seq(from = 0, to = 6e-7, by = 3e-7)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=1200, y=-7.5, label=lm_lab_percell_rich_PA, parse = TRUE, color = "#FF6600", size = 4) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))

# Summary stats 
summary(lm(log10(fracprod_per_cell_noinf) ~ richness, data = filter(wide_div_meta_df, fraction == "Particle")))
# WITHOUT THE TWO HIGH POINTS 
# (sensitivity check: refit using only particle samples with richness < 1000)
summary(lm(log10(fracprod_per_cell_noinf) ~ richness, data = filter(wide_div_meta_df, fraction == "Particle" & richness < 1000)))


######################################################### 2D = SIMPSON DIVERSITY (INVERSE SIMPSON)
######################################################### 2D = SIMPSON DIVERSITY (INVERSE SIMPSON)
# Wilcoxon test of inverse Simpson diversity between the two fractions (printed)
invsimps_fraction_wilcox <- wilcox.test(simpson ~ fraction, data = wide_div_meta_df)
invsimps_fraction_wilcox

# Per-fraction summary statistics of inverse Simpson (printed to the report)
wide_div_meta_df %>%
  group_by(fraction) %>%
  summarize(mean(simpson), sd(simpson), min(simpson), max(simpson))

# Make a data frame to draw significance line in boxplot (visually calculated)
invsimps_nums <- data.frame(a = c(1.15,1.15,1.85,1.85), b = c(83,85,85,83)) # WholePart vs WholeFree

# Jittered points + boxplot of inverse Simpson per fraction, flipped so that
# the diversity value runs along the horizontal axis.
invsimps_distribution_plot <- 
  wide_div_meta_df %>%
  ggplot(aes(y = simpson, x = fraction)) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  geom_jitter(size = 3,  aes(fill = fraction, shape = season), width = 0.2) + 
  geom_boxplot(alpha = 0.5, outlier.shape = NA, aes(fill = fraction)) +
  scale_y_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) + 
  labs(y = expression({}^2*italic(D)),
       x = "Fraction") +
  # Bracket between the two fractions (coordinates from invsimps_nums)
  geom_path(data = invsimps_nums, aes(x = a, y = b), linetype = 1, color = "#424645") +
  # NOTE(review): the "NS" label is hard-coded, not derived from invsimps_fraction_wilcox
  annotate("text", x=1.5, y=80, size = 4, color = "#424645", label= "NS") +
  theme(legend.position = "none", #axis.title.y = element_blank(),
        axis.title.x = element_blank(), axis.text.x = element_blank()) +
  coord_flip()


################ Inverse Simpson vs Community-wide (Per-Liter) Production 
## 1. Extract the R2 and p-value from the linear model: 
# Plotmath label with the adjusted R2 of the particle model and its
# FDR-adjusted p-value (element 3 of fig3_adjusted_pvals).
lm_lab_perliter_invsimps_PA <- paste("atop(R^2 ==", round(summary(lm_prod_vs_invsimps_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(fig3_adjusted_pvals[3]), digits = 3), ")")

## 2. Plot Inverse Simpson vs Community-wide (Per-Liter) Production 
prod_vs_invsimps_plot <-  
   wide_div_meta_df %>%
  # Fetch the standard error from diversity measure to plot error bars 
  left_join(filter(div_iNEXT, Diversity == "Simpson diversity"), by = "norep_filter_name") %>%
  rename(se_iNEXT ="s.e.") %>%
  ggplot(aes(x=simpson, y=frac_bacprod)) + 
  geom_errorbarh(aes(xmin = simpson - se_iNEXT, xmax = simpson + se_iNEXT, 
                     color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) +  # Y-axis errorbars
  geom_point(size = 3.5, color = "black", aes(fill = fraction, shape = season)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  labs(x = expression({}^2*italic(D)),
       y = expression(atop("Community Production", "(μgC/L/day)"))) + 
  # Regression line + ribbon drawn for the particle-associated samples only
  geom_smooth(data=filter(wide_div_meta_df, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) + 
  scale_y_continuous(limits = c(-5, 75), breaks = seq(0, 75, by = 15)) +
  # Add the R2 and p-value to the plot 
  annotate("text", x=70, y=45, label=lm_lab_perliter_invsimps_PA, parse = TRUE, color = "#FF6600", size = 4) +
  theme(legend.position = "none") #axis.title.x = element_blank(), axis.text.x = element_blank() 

# Summary stats 
summary(lm(frac_bacprod ~ simpson, data = filter(wide_div_meta_df, fraction == "Particle")))
# WITHOUT THE TWO HIGH POINTS 
# (sensitivity check: refit using only particle samples with simpson < 70)
summary(lm(frac_bacprod ~ simpson, data = filter(wide_div_meta_df, fraction == "Particle" & simpson < 70)))


################ Inverse Simpson vs Per Capita (Per-Cell) Production 
## 1. Extract the R2 and p-value from the linear model: 
# Plotmath label with the adjusted R2 of the particle per-cell model and its
# FDR-adjusted p-value (element 4 of fig3_adjusted_pvals).
lm_lab_percell_invsimps_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_invsimps_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(fig3_adjusted_pvals[4]), digits = 3), ")")

## 2. Plot Inverse Simpson vs Per Capita (Per-Cell) Production 
percell_prod_vs_invsimps_plot <- 
  wide_div_meta_df %>%
  # Fetch the standard error from diversity measure to plot error bars 
  left_join(filter(div_iNEXT, Diversity == "Simpson diversity"), by = "norep_filter_name") %>%
  rename(se_iNEXT ="s.e.") %>%
  ggplot(aes(x=simpson, y=log10(fracprod_per_cell_noinf))) + 
  geom_errorbarh(aes(xmin = simpson - se_iNEXT, xmax = simpson + se_iNEXT, 
                     color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_point(size = 3.5, color = "black", aes(shape = season, fill = fraction)) +
  # The colour legend is configured by guides() below, so no `guide` argument
  # is needed here (a logical `guide = TRUE` is not a valid guide specification).
  scale_color_manual(values = fraction_colors) +
  # Hide the fill legend; "none" replaces the deprecated `guide = FALSE`.
  scale_fill_manual(values = fraction_colors, guide = "none") +
  scale_shape_manual(values = season_shapes) +
  labs(x = expression({}^2*italic(D)),
       y = expression(atop(log[10]*"(Per-Capita Production)", "(μgC/cell/day)"))) + 
  # Regression line + ribbon drawn for the particle-associated samples only
  geom_smooth(data=filter(wide_div_meta_df, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) + 
  scale_y_continuous(limits = c(-8.5,-5), breaks = seq(from = -8, to =-5, by = 1)) + 
  # Add the R2 and p-value to the plot 
  annotate("text", x=70, y=-7.5, label=lm_lab_percell_invsimps_PA, parse = TRUE, color = "#FF6600", size = 4) +
  # Draw the colour legend keys as filled squares
  guides(color = guide_legend(override.aes = list(shape = 15))) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))

# Summary stats 
summary(lm(log10(fracprod_per_cell_noinf) ~ simpson, data = filter(wide_div_meta_df, fraction == "Particle")))
# WITHOUT THE TWO HIGH POINTS 
# (sensitivity check: refit using only particle samples with simpson < 70)
summary(lm(log10(fracprod_per_cell_noinf) ~ simpson, data = filter(wide_div_meta_df, fraction == "Particle" & simpson < 70)))
```

**Plot figure 3**
```{r Figure-3, fig.width=7, fig.height=6}
# All plots together 
# 2 x 2 grid: community production (top row) and per-capita production
# (bottom row) against richness (left) and inverse Simpson (right).
# The per-plot legends of the bottom-row panels are switched off here; a
# shared legend is added underneath instead.
fig3_top <- plot_grid(prod_vs_rich_plot + theme(plot.margin = unit(c(0.2,0.2,0.2,0.8), "cm")), # t, r, b, l
           prod_vs_invsimps_plot, 
           percell_prod_vs_rich_plot + theme(legend.position = "none"),
           percell_prod_vs_invsimps_plot + theme(legend.position = "none"), 
           labels = c("A", "B", "C", "D"), ncol = 2, nrow = 2,
           rel_heights = c(1, 1), rel_widths = c(1,1),
           align = "hv")

# add legend 
# season_legend is stacked below the panel grid in a thin strip (relative
# height 0.05).
plot_grid(fig3_top, season_legend,
                   ncol = 1, nrow = 2, 
                   rel_heights = c(1, 0.05))

```

**Graphical Abstract**
```{r graphical-abstract, fig.width=3.75, fig.height=3.5}
# Graphical abstract: inverse Simpson diversity vs community production for
# both fractions, with the regression line for particle-associated samples.
wide_div_meta_df %>%
  # Fetch the standard error from diversity measure to plot error bars 
  left_join(filter(div_iNEXT, Diversity == "Simpson diversity"), by = "norep_filter_name") %>%
  rename(se_iNEXT ="s.e.") %>%
  ggplot(aes(x=simpson, y=frac_bacprod)) + 
  geom_errorbarh(aes(xmin = simpson - se_iNEXT, xmax = simpson + se_iNEXT, 
                     color = fraction), alpha = 0.7) + # X-axis errorbars
  geom_errorbar(aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)) +  # Y-axis errorbars
  # All points are drawn as filled squares (shape 22). The former
  # `shape = season` mapping was overridden by this constant and has been
  # removed because it had no effect.
  geom_point(size = 3.5, shape = 22, color = "black", aes(fill = fraction)) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  labs(x = expression(textstyle("Inverse Simpson or")~{}^2*italic(D)),
       y = "Community Production \n (μgC/L/day)") + 
  # Regression line + ribbon drawn for the particle-associated samples only
  geom_smooth(data=filter(wide_div_meta_df, fraction == "Particle"), method='lm', color = "#FF6600", fill = "#FF6600") + 
  scale_x_continuous(limits = c(0,85), breaks = seq(from = 0, to = 85, by = 20)) + 
  scale_y_continuous(limits = c(-5, 75), breaks = seq(0, 75, by = 15)) +
  # Add the R2 and p-value to the plot (label built in the prep-Figure-3 chunk)
  annotate("text", x=70, y=45, label=lm_lab_perliter_invsimps_PA, parse = TRUE, color = "#FF6600", size = 4) +
  theme(legend.position = c(0.68, 0.90), legend.title = element_blank()) #axis.title.x = element_blank(), axis.text.x = element_blank() 
```


## Figure 4
**Phylodiv vs ses.MPD**
```{r Figure-4, fig.width=3.5, fig.height=4.2}
# Is there a relationship between phylogenetic richness (0D) and the unweighted mean pairwise distance?
# 1. Since it's a general correlation - the linear model is fit to all samples
#    in wide_div_meta_df (no filtering by fraction).
lm_unweightMPD_phylorich <- lm(unweighted_sesmpd ~ phylo_richness, data = wide_div_meta_df)
## 2. Extract the R2 and p-value from the linear model: 
# Plotmath label with the adjusted R2 and the (unadjusted) slope p-value
lm_lab_unweightMPD_phylorich <- paste("atop(R^2 ==", round(summary(lm_unweightMPD_phylorich)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(summary(lm_unweightMPD_phylorich)$coefficients[,4][2]), digits = 3), ")")
## 3. Plot 
phy_MPD_p0 <- wide_div_meta_df %>%
  ggplot(aes(x = phylo_richness, y = unweighted_sesmpd, fill = fraction)) +
  geom_point(aes(shape = season), size = 3) + 
  scale_fill_manual(values = fraction_colors) + 
  scale_shape_manual(values = season_shapes) +
  labs(x = expression({}^0*italic(PD)),
       y = "Unweighted MPD") +
  geom_smooth(method='lm', color = "black", fill = "grey") +   
  # Add the R2 and p-value to the plot 
  annotate("text", x=140, y=1, label=lm_lab_unweightMPD_phylorich, parse = TRUE, color = "black", size = 4) + 
  theme(legend.position = "none")

# Stack the plot above the two-row season legend
plot_grid(phy_MPD_p0, season_2row_legend, rel_heights = c(1, 0.1),
          nrow = 2, ncol = 1)

```

## Lasso Regressions
**Prepare data for lasso**
```{r prep-data-lasso}
# The lasso regression needs datasets that are wide formatted.
# Columns listed here are dropped from wide_div_meta_df before modelling.
lasso_cols_remove <- c("project", "year", "Date", "limnion", "dnaconcrep1", "perc_attached_bacprod",
                       "SD_perc_attached_bacprod", "SE_total_bac_abund", "SE_perc_attached_abund", "SE_attached_bac",
                       "tot_bacprod", "SD_tot_bacprod","SD_frac_bacprod", "BGA_cellspermL", "Turb_NTU", 
                       "lakesite", "season", "station")

### PARTICLE DATA = 12 samples 
# Note (from the original analysis): sample MOTEJ515 has no bacterial counts;
# including it reportedly does not change the results of this lasso regression.
# all_of() marks lasso_cols_remove as an external character vector of column
# names, so it can never be mistaken for a column of the data frame.
lasso_data_df_particle <- wide_div_meta_df %>%
  filter(fraction == "Particle" ) %>%
  dplyr::select(-c(fraction, all_of(lasso_cols_remove))) 

# Community-production lasso input: drop both per-cell responses and the
# sample identifiers, leaving frac_bacprod as the response.
lasso_data_df_particle_noprod <- lasso_data_df_particle %>%
  dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf, norep_filter_name, norep_water_name)) 
  
# Per-capita lasso input: keep fracprod_per_cell_noinf as the response, drop
# the other production columns, MOTEJ515 and the sample identifiers.
percell_lasso_data_df_particle_noprod <- lasso_data_df_particle %>%
  dplyr::select(-c(fracprod_per_cell, frac_bacprod)) %>%
  dplyr::filter(norep_filter_name != "MOTEJ515") %>% # Remove the missing row of data :( )
  dplyr::select(-c(norep_filter_name, norep_water_name))

### FREE DATA 
# Same preparation as for the particle data above, applied to the
# free-living samples.
lasso_data_df_free <- wide_div_meta_df %>%
  filter(fraction == "Free" ) %>%
  dplyr::select(-c(fraction, all_of(lasso_cols_remove))) 

lasso_data_df_free_noprod <- lasso_data_df_free %>%
  dplyr::select(-c(fracprod_per_cell, fracprod_per_cell_noinf, norep_filter_name, norep_water_name)) 

percell_lasso_data_df_free_noprod <- lasso_data_df_free %>%
  dplyr::select(-c(fracprod_per_cell, frac_bacprod)) %>%
  dplyr::filter(norep_filter_name != "MOTEJ515") %>% # Remove the missing row of data :( )
  dplyr::select(-c(norep_filter_name, norep_water_name))
```

#### Run lasso regression
**Lasso - Community Production: Particle**
```{r lasso-comm-part, fig.width=8, fig.height=4}
# Set the seed so that the random train/test split below (and any random
# steps inside cv.glmnet) are reproducible. The lambda grid itself is
# deterministic.
set.seed(777) 

################ PREPARE DATA ################ 
# Subset the data needed and scale all of the variables
scaled_comm_data <- 
  lasso_data_df_particle_noprod %>%    # Use data only for the particle samples  
  scale() %>%                          # Scale all of the variables to have mean =0 and sd = 1
  as.data.frame()                      # Make it a dataframe so that model.matrix function works (does not take a matrix)

# Set model parameters for community level data
## NOTE: there cannot be any data with NA
x <- model.matrix(frac_bacprod ~ ., scaled_comm_data)[,-1]   # predictors; first (intercept) column dropped
y <- scaled_comm_data$frac_bacprod                           # response
grid <- 10^seq(10, -2, length.out = 100)                     # lambda values from 1e10 down to 1e-2

################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################ 
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model 
train <- sample(seq_len(nrow(x)), nrow(x)/2)
test <- -train          # negative indices select the rows not used for training
y_test <- y[test]

################ LASSO ################ 
# Run lasso regression with alpha = 1
# standardize = FALSE because the variables were already scaled above
lasso_divs_train <- glmnet(x[train,], y[train], alpha = 1, lambda = grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train)

# Cross validation
# NOTE(review): unlike the glmnet() calls, this call passes neither
# `lambda = grid` nor `standardize = FALSE`, so glmnet's defaults apply here.
# Confirm this is intended.
cv_lasso_divs <- cv.glmnet(x[train,], y[train], alpha = 1)
plot(cv_lasso_divs)

best_lasso_lambda <- cv_lasso_divs$lambda.min # Minimum lambda
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
lasso_lambda_1se <- cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
best_lasso_lambda == lasso_lambda_1se

# Mean squared error on the held-out half
lasso_divs_pred <- predict(lasso_divs_train, s = best_lasso_lambda, newx = x[test,])
mean((lasso_divs_pred - y_test)^2) # Test

## Run lasso on the entire dataset with the best lambda value 
lasso_divs <- glmnet(x, y, alpha = 1, lambda = best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs, type = "coefficients", s = best_lasso_lambda)

## Now, run the most simple, parsimonious lasso model within 1 standard error  
## IMPORTANT NOTE: This is the lasso reported within the manuscript
lasso_divs_1se <- glmnet(x, y, alpha = 1, lambda = lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(lasso_divs_1se, type = "coefficients", s = lasso_lambda_1se)
```

**Lasso - Community Production: Free**
```{r lasso-comm-free, fig.width=8, fig.height=4}
# Set the seed so that the random train/test split below (and any random
# steps inside cv.glmnet) are reproducible. The lambda grid itself is
# deterministic.
set.seed(777) 
     
################ PREPARE DATA ################ 
# Subset the data needed and scale all of the variables
scaled_comm_data_free <- 
  lasso_data_df_free_noprod %>%        # Use data only for the free-living samples  
  scale() %>%                          # Scale all of the variables to have mean = 0 and sd = 1
  as.data.frame()                      # Make it a dataframe so that model.matrix function works (does not take a matrix)

# Set model parameters for community level data
## NOTE: there cannot be any data with NA
free_x <- model.matrix(frac_bacprod ~ ., scaled_comm_data_free)[,-1]   # predictors; first (intercept) column dropped
free_y <- scaled_comm_data_free$frac_bacprod                           # response
free_grid <- 10^seq(10, -2, length.out = 100)                          # lambda values from 1e10 down to 1e-2

################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################ 
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model 
free_train <- sample(seq_len(nrow(free_x)), nrow(free_x)/2)
free_test <- -free_train          # negative indices select the rows not used for training
free_y_test <- free_y[free_test]

################ LASSO ################ 
# Run lasso regression with alpha = 1
# standardize = FALSE because the variables were already scaled above
lasso_divs_train_free_comm <- glmnet(free_x[free_train,], free_y[free_train], alpha = 1, lambda = free_grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(lasso_divs_train_free_comm)

# Cross validation
# NOTE(review): unlike the glmnet() calls, this call passes neither
# `lambda = free_grid` nor `standardize = FALSE`, so glmnet's defaults apply
# here. Confirm this is intended.
free_cv_lasso_divs <- cv.glmnet(free_x[free_train,], free_y[free_train], alpha = 1)
plot(free_cv_lasso_divs)

free_best_lasso_lambda <- free_cv_lasso_divs$lambda.min
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
free_lasso_lambda_1se <- free_cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
free_best_lasso_lambda == free_lasso_lambda_1se

# Mean squared error on the held-out half of the FREE-LIVING data.
# Compare against free_y_test (this chunk's held-out response), not the
# particle chunk's y_test.
free_lasso_divs_pred <- predict(lasso_divs_train_free_comm, s = free_best_lasso_lambda, newx = free_x[free_test,])
mean((free_lasso_divs_pred - free_y_test)^2) # Test

## Run lasso on the entire dataset with the best lambda value 
free_lasso_divs <- glmnet(free_x, free_y, alpha = 1, lambda = free_best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(free_lasso_divs, type = "coefficients", s = free_best_lasso_lambda)

## Now, run the most simple, parsimonious lasso model within 1 standard error  
## IMPORTANT NOTE: This is the lasso reported within the manuscript
free_lasso_divs_1se <- glmnet(free_x, free_y, alpha = 1, lambda = free_lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(free_lasso_divs_1se, type = "coefficients", s = free_lasso_lambda_1se)
```

**Lasso - Per Capita Production: Particle**
```{r lasso-perfacp-part, fig.width=8, fig.height=4}
# Set the seed so that the random train/test split below (and any random
# steps inside cv.glmnet) are reproducible. The lambda grid itself is
# deterministic.
set.seed(777) 

################ PREPARE DATA ################ 
# Subset the data needed, log10-transform the per-cell response and scale
# all of the variables
scaled_percapita_pa_data <- 
  percell_lasso_data_df_particle_noprod %>%   
  dplyr::filter(!is.na(fracprod_per_cell_noinf)) %>%           # drop samples without a per-cell value
  mutate(log10_percell = log10(fracprod_per_cell_noinf)) %>%   # response on the log10 scale
  dplyr::select(-c(fracprod_per_cell_noinf)) %>%               # keep only the transformed response
  scale() %>%
  as.data.frame()                         # Make it a dataframe so that model.matrix function works (does not take a matrix)

# Set model parameters for per-capita level data
## NOTE: there cannot be any data with NA
percap_pa_x <- model.matrix(log10_percell ~ ., scaled_percapita_pa_data)[,-1]   # predictors; first (intercept) column dropped
percap_pa_y <- scaled_percapita_pa_data$log10_percell                           # response
percap_pa_grid <- 10^seq(10, -2, length.out = 100)                              # lambda values from 1e10 down to 1e-2

################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################ 
# Pull out test and training sets for cross validation
# We will use half the set to train the model and the 2nd half of the dataset to test the model 
percap_pa_train <- sample(seq_len(nrow(percap_pa_x)), nrow(percap_pa_x)/2)
percap_pa_test <- -percap_pa_train          # negative indices select the rows not used for training
percap_pa_y_test <- percap_pa_y[percap_pa_test]

################ LASSO ################ 
# Run lasso regression with alpha = 1
# standardize = FALSE because the variables were already scaled above
percap_pa_lasso_divs_train <- glmnet(percap_pa_x[percap_pa_train,], percap_pa_y[percap_pa_train], alpha = 1, lambda = percap_pa_grid, standardize = FALSE)
par(mfrow = c(1,2))
plot(percap_pa_lasso_divs_train)

# Cross validation
# NOTE(review): unlike the glmnet() calls, this call passes neither
# `lambda = percap_pa_grid` nor `standardize = FALSE`, so glmnet's defaults
# apply here. Confirm this is intended.
percap_pa_cv_lasso_divs <- cv.glmnet(percap_pa_x[percap_pa_train,], percap_pa_y[percap_pa_train], alpha = 1)
plot(percap_pa_cv_lasso_divs)

percap_pa_best_lasso_lambda <- percap_pa_cv_lasso_divs$lambda.min # Minimum lambda
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
percap_pa_lasso_lambda_1se <- percap_pa_cv_lasso_divs$lambda.1se # simplest model whose accuracy is comparable with the best model
percap_pa_best_lasso_lambda == percap_pa_lasso_lambda_1se

# Mean squared error on the held-out half
percap_pa_lasso_divs_pred <- predict(percap_pa_lasso_divs_train, s = percap_pa_best_lasso_lambda, newx = percap_pa_x[percap_pa_test,])
mean((percap_pa_lasso_divs_pred - percap_pa_y_test)^2) # Test

## Run lasso on the entire dataset with the best lambda value 
percap_pa_lasso_divs <- glmnet(percap_pa_x, percap_pa_y, alpha = 1, lambda = percap_pa_best_lasso_lambda, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(percap_pa_lasso_divs, type = "coefficients", s = percap_pa_best_lasso_lambda)

## Now, run the most simple, parsimonious lasso model within 1 standard error  
## IMPORTANT NOTE: This is the lasso reported within the manuscript
percap_pa_lasso_divs_1se <- glmnet(percap_pa_x, percap_pa_y, alpha = 1, lambda = percap_pa_lasso_lambda_1se, standardize = FALSE)
# What are the lasso coefficients? (Anything with a . is not selected by the model)
predict(percap_pa_lasso_divs_1se, type = "coefficients", s = percap_pa_lasso_lambda_1se)
```

**Lasso - Per Capita Production: Free**
```{r lasso-perfacp-free, fig.width=8, fig.height=4}
# Seed the RNG so the random train/test split drawn with sample() below is repeatable
set.seed(777)

################ PREPARE DATA ################
# Keep rows that have a per-cell production value, log10-transform that value
# into the response column, then pass every column through scale()
scaled_percapita_fl_data <-
  percell_lasso_data_df_free_noprod %>%
  dplyr::filter(!is.na(fracprod_per_cell_noinf)) %>%
  mutate(log10_percell = log10(fracprod_per_cell_noinf)) %>%
  dplyr::select(-fracprod_per_cell_noinf) %>%
  scale() %>%
  as.data.frame() # scale() returns a matrix, but model.matrix() below needs a data frame

# Model inputs for the per-capita production lasso
## NOTE: the data must not contain any NA values
# Design matrix of all predictors; [, -1] drops the first (intercept) column
percap_fl_x <- model.matrix(log10_percell ~ ., data = scaled_percapita_fl_data)[, -1]
percap_fl_y <- scaled_percapita_fl_data$log10_percell
# 100 candidate penalties, log-spaced from 10^10 down to 10^-2
percap_fl_grid <- 10^seq(from = 10, to = -2, length.out = 100)

################ PREPARE TRAINING & TESTING DATA FOR CROSS VALIDATION ################
# Randomly draw half of the row indices for training; negating those indices
# selects the remaining rows, which form the test set
percap_fl_train <- sample(seq_len(nrow(percap_fl_x)), size = nrow(percap_fl_x) / 2)
percap_fl_test <- -percap_fl_train
percap_fl_y_test <- percap_fl_y[percap_fl_test]

################ LASSO ################
# Fit the lasso (alpha = 1) on the training rows over the whole penalty grid
percap_fl_lasso_divs_train <- glmnet(
  x = percap_fl_x[percap_fl_train, ],
  y = percap_fl_y[percap_fl_train],
  alpha = 1,
  lambda = percap_fl_grid,
  standardize = FALSE
)
par(mfrow = c(1, 2)) # two side-by-side panels: the fit above and the CV curve below
plot(percap_fl_lasso_divs_train)

# Cross validation on the same training rows (no lambda grid is supplied here)
percap_fl_cv_lasso_divs <- cv.glmnet(
  x = percap_fl_x[percap_fl_train, ],
  y = percap_fl_y[percap_fl_train],
  alpha = 1
)
plot(percap_fl_cv_lasso_divs)

# lambda.min: the lambda with the minimum cross-validated error
percap_fl_best_lasso_lambda <- percap_fl_cv_lasso_divs$lambda.min
# lambda.1se: the simplest model whose accuracy is comparable with the best model
# https://stats.stackexchange.com/questions/138569/why-is-lambda-plus-1-standard-error-a-recommended-value-for-lambda-in-an-elastic
percap_fl_lasso_lambda_1se <- percap_fl_cv_lasso_divs$lambda.1se
# Print whether the two rules selected the same lambda
percap_fl_best_lasso_lambda == percap_fl_lasso_lambda_1se

# Mean squared error of the training-set fit on the held-out test rows
percap_fl_lasso_divs_pred <- predict(
  percap_fl_lasso_divs_train,
  newx = percap_fl_x[percap_fl_test, ],
  s = percap_fl_best_lasso_lambda
)
mean((percap_fl_lasso_divs_pred - percap_fl_y_test)^2)

## Refit on the entire dataset with the minimum-error lambda
percap_fl_lasso_divs <- glmnet(
  x = percap_fl_x,
  y = percap_fl_y,
  alpha = 1,
  lambda = percap_fl_best_lasso_lambda,
  standardize = FALSE
)
# Lasso coefficients (anything shown as "." was not selected by the model)
predict(percap_fl_lasso_divs, type = "coefficients", s = percap_fl_best_lasso_lambda)

## Refit on the entire dataset with the more parsimonious 1-standard-error lambda
## IMPORTANT NOTE: This is the lasso reported within the manuscript
percap_fl_lasso_divs_1se <- glmnet(
  x = percap_fl_x,
  y = percap_fl_y,
  alpha = 1,
  lambda = percap_fl_lasso_lambda_1se,
  standardize = FALSE
)
# Lasso coefficients (anything shown as "." was not selected by the model)
predict(percap_fl_lasso_divs_1se, type = "coefficients", s = percap_fl_lasso_lambda_1se)
```


# Supplemental

## Figure S1
**Map of Muskegon Lake**
```{r Figure-S1}
# ggmap provides get_map() and ggmap(), used here for the Muskegon Lake basemap
library(ggmap)

# Station coordinates: keep the station name and shorten the position column names
lakesite_coordinates <- read.csv("data/metadata/ML_GPS_Coordinates.csv") %>%
  dplyr::select(lakesite, Lat = Latitude, Long = Longitude)

# Fetch the basemap; the bounding box is given as left/bottom/right/top
map <- get_map(
  location = c(-86.35, 43.21, -86.235, 43.265),
  zoom = 13,
  maptype = "toner-background"
)

# Draw the basemap with every station as a labelled yellow point
ggmap(map) +
  labs(x = "Longitude", y = "Latitude") +
  theme(
    axis.text = element_text(size = 10, color = "black"),
    axis.title = element_text(size = 12, color = "black", face = "bold")
  ) +
  geom_point(
    data = lakesite_coordinates,
    mapping = aes(x = Long, y = Lat),
    color = "black", fill = "yellow", size = 4, shape = 21
  ) +
  # hjust = -0.25 pushes each label to the right of its point
  geom_text(
    data = lakesite_coordinates,
    mapping = aes(x = Long, y = Lat, label = lakesite),
    col = "black", cex = 4, vjust = 0.5, hjust = -0.25
  )
```


## Figure S2
```{r Figure-S2, fig.width=10, fig.height=7}
# Hill number orders shown in the facets (the `order` variable below):
# h = 0 = richness
# h = 1 = exp(Shannon)
# h = 2 = Inverse Simpsons

# Build a per-sample lookup used to colour the iNEXT curves:
# one row per column of the iNEXT input table, in that column order
dat <- colnames(iNEXT_input_table) %>%
  data.frame()
colnames(dat)[1] <- "norep_filter_name"
# First six metadata columns, with the row names exposed as the join key
sub_metadata <- metadata[, 1:6] %>%
  tibble::rownames_to_column(var = "norep_filter_name")

# Attach the metadata and derive one colour/shape value per sample
dat_iNEXT <- dat %>%
  left_join(sub_metadata, by = "norep_filter_name") %>%
  mutate(fraction_color = ifelse(fraction == "Particle", "#FF6600", "skyblue"),
         shape = ifelse(season == "Spring", 25,
                        ifelse(season == "Summer", 23,
                               21)),
         station_color = ifelse(lakesite == "Bear", "black",
                                ifelse(lakesite == "Deep", "blue",
                                       ifelse(lakesite == "Outlet", "skyblue",
                                              "forestgreen"))),
         season_color = ifelse(season == "Spring", "skyblue",
                               ifelse(season == "Summer", "forestgreen",
                                      "orange")))

# Sample-size-based rarefaction/extrapolation curves, one facet per Hill order.
# guide = "none" replaces the deprecated guide = FALSE and matches the
# guides(... = "none") call used for the legend plot below.
p_iNEXT <- ggiNEXT(iNEXT_data, type = 1, facet.var = "order") +
  facet_wrap(~order, scales = "free") +
  scale_shape_manual(values = dat_iNEXT$shape, guide = "none") +
  scale_color_manual(values = dat_iNEXT$fraction_color, guide = "none") +
  scale_fill_manual(values = dat_iNEXT$fraction_color, guide = "none") +
  theme(legend.position = "none") +
  theme(plot.title = element_text())

# Same curves drawn a second time purely to harvest a legend from them;
# `grey` is passed as a proper logical instead of the string "true"
p_iNEXT_legend <- ggiNEXT(iNEXT_data, type = 1, facet.var = "order", grey = TRUE) +
  facet_wrap(~order, scales = "free") +
  guides(color = "none", shape = "none", fill = "none") +
  theme(legend.text = element_text(size = 13))

# Extract the legend
iNEXT_legend <- cowplot::get_legend(p_iNEXT_legend)

# Stack the curves and the two legends to form the top panel (A) of Figure S2
figS2_a <- plot_grid(p_iNEXT, iNEXT_legend, season_legend,
                     ncol = 1, nrow = 3, labels = c("A", "", ""),
                     rel_heights = c(1, 0.05, 0.05))


######################################
#### Correlations between phylogenetic & non-phylogenetic Hill numbers
# One linear model per Hill order; the summaries supply adjusted R2 and the slope p-value
lm_phylo_rich <- summary(lm(phylo_richness ~ richness, data = div_df))
lm_phylo_shan <- summary(lm(phylo_shannon ~ shannon, data = div_df))
lm_phylo_simps <- summary(lm(phylo_simpson ~ simpson, data = div_df))

# Build the three scatter plots, then combine them into one row
# 1. RICHNESS PLOT
# plotmath label: R2 stacked on top of the slope p-value (row 2, column 4 of the coefficient table)
lab_lm_phylo_rich <- paste(
  "atop(R^2 ==", round(lm_phylo_rich$adj.r.squared, digits = 2), ",",
  "p ==", round(lm_phylo_rich$coefficients[2, 4], digits = 19), ")"
)
p1_phylo_rich <- ggplot(div_df, aes(x = richness, y = phylo_richness)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = expression({}^0*italic(D)),
    y = expression(~{}^0*italic(PD))
  ) +
  annotate("text", x = 600, y = 150, label = lab_lm_phylo_rich,
           parse = TRUE, color = "blue", size = 4)

# 2. SHANNON PLOT
lab_lm_phylo_shan <- paste(
  "atop(R^2 ==", round(lm_phylo_shan$adj.r.squared, digits = 2), ",",
  "p ==", round(lm_phylo_shan$coefficients[2, 4], digits = 10), ")"
)
p2_phylo_shan <- ggplot(div_df, aes(x = shannon, y = phylo_shannon)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = expression(~{}^1*italic(D)),
    y = expression(~{}^1*italic(PD))
  ) +
  annotate("text", x = 100, y = 15, label = lab_lm_phylo_shan,
           parse = TRUE, color = "blue", size = 4)

# 3. SIMPSON PLOT
lab_lm_phylo_simps <- paste(
  "atop(R^2 ==", round(lm_phylo_simps$adj.r.squared, digits = 2), ",",
  "p ==", round(lm_phylo_simps$coefficients[2, 4], digits = 5), ")"
)
p3_phylo_simps <- ggplot(div_df, aes(x = simpson, y = phylo_simpson)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = expression(~{}^2*italic(D)),
    y = expression(~{}^2*italic(PD))
  ) +
  annotate("text", x = 30, y = 6, label = lab_lm_phylo_simps,
           parse = TRUE, color = "blue", size = 4)

# Bottom row of Figure S2 (panels B-D); the same grid is kept under both names
coors_doubleton <- plot_grid(
  p1_phylo_rich, p2_phylo_shan, p3_phylo_simps,
  nrow = 1, ncol = 3,
  labels = c("B", "C", "D")
)
figS2_b <- coors_doubleton


###### FIGURE S2
# Panel A on top, panels B-D underneath at 0.8 of its height
plot_grid(figS2_a, figS2_b, nrow = 2, ncol = 1, rel_heights = c(1, 0.8))


```

```{r iNEXT, fig.width=6, fig.height=3.5}
# Wilcoxon tests comparing the two fractions, for observed and for estimated richness
wilcox_rich_obs <- wilcox.test(
  Observed ~ fraction,
  data = dplyr::filter(div_df_frac, Diversity == "Species richness")
)
wilcox_rich_est <- wilcox.test(
  Estimator ~ fraction,
  data = dplyr::filter(div_df_frac, Diversity == "Species richness")
)

# Panel A: observed richness by fraction, annotated with the Wilcoxon p-value
rich_p1 <- div_df_frac %>%
  dplyr::filter(Diversity == "Species richness") %>%
  ggplot(aes(x = fraction, y = Observed)) +
  geom_jitter(aes(fill = fraction, shape = season), size = 3, width = 0.2) +
  geom_boxplot(aes(fill = fraction), alpha = 0.5, outlier.shape = NA) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  scale_y_continuous(
    limits = c(0, 1800),
    breaks = seq(0, 1800, by = 300),
    expand = c(0, 0)
  ) +
  labs(y = "Observed Richness") +
  theme(legend.position = "none", axis.title.x = element_blank()) +
  annotate(
    "text", x = 1.5, y = 1500, size = 4, color = "gray40",
    label = paste("Wilcoxon, p =", round(wilcox_rich_obs$p.value, digits = 3))
  )

# Panel B: the richness estimator, drawn on the same y scale as panel A
rich_p2 <- div_df_frac %>%
  dplyr::filter(Diversity == "Species richness") %>%
  ggplot(aes(x = fraction, y = Estimator)) +
  geom_jitter(aes(fill = fraction, shape = season), size = 3, width = 0.2) +
  geom_boxplot(aes(fill = fraction), alpha = 0.5, outlier.shape = NA) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  scale_y_continuous(
    limits = c(0, 1800),
    breaks = seq(0, 1800, by = 300),
    expand = c(0, 0)
  ) +
  labs(y = "Richness Estimator") +
  theme(legend.position = "none", axis.title.x = element_blank()) +
  annotate(
    "text", x = 1.5, y = 1500, size = 4, color = "gray40",
    label = paste("Wilcoxon, p =", round(wilcox_rich_est$p.value, digits = 3))
  )

# Shared title strip; the 125-unit left margin shifts the label to the right
rich_title <- ggdraw() +
  draw_label("iNEXT richness estimates", fontface = "bold", x = 0, hjust = 0) +
  theme(plot.margin = margin(0, 0, 0, 125))

# Both panels side by side
rich_plotz <- plot_grid(rich_p1, rich_p2, labels = c("A", "B"))

# Title strip above the panel row
plot_grid(rich_title, rich_plotz,
          nrow = 2, ncol = 1, rel_heights = c(0.1, 1))
```

## Figure S3
**Correlations between the Particle and Free fractions for the values shown in Figure 1**
```{r Figure-S3, fig.width = 12, fig.height = 4}
# Per-sample abundance and production values, keyed by a water-sample name that
# is shared by the particle and free samples: characters 1-4 and 6-9 of
# norep_filter_name (i.e. the name with its 5th character removed)
work_df <- metadata %>%
  dplyr::select(norep_filter_name, fraction, fraction_bac_abund, frac_bacprod, fracprod_per_cell_noinf) %>%
  mutate(norep_water_name = paste0(substr(norep_filter_name, 1, 4), substr(norep_filter_name, 6, 9))) %>%
  dplyr::select(-norep_filter_name)

# Particle fraction, measurement columns prefixed with part_.
# filter()/rename() are namespace-qualified like the other dplyr calls in this
# file so that a masking package (e.g. stats or plyr) cannot hijack them.
part_work_df <- work_df %>%
  dplyr::filter(fraction == "Particle") %>%
  dplyr::rename(part_bacabund = fraction_bac_abund,
                part_prod = frac_bacprod,
                part_percell_prod = fracprod_per_cell_noinf) %>%
  dplyr::select(-fraction)

# Free fraction, measurement columns prefixed with free_
free_work_df <- work_df %>%
  dplyr::filter(fraction == "Free") %>%
  dplyr::rename(free_bacabund = fraction_bac_abund,
                free_prod = frac_bacprod,
                free_percell_prod = fracprod_per_cell_noinf) %>%
  dplyr::select(-fraction)

# One row per water sample: particle values with the matching free values alongside
byfrac_df <- part_work_df %>%
  left_join(free_work_df, by = "norep_water_name")

# Season is decoded from the 5th character of norep_water_name (the 6th of the
# original filter name): "5" = Spring, "7" = Summer, "9" = Fall.
# NOTE(review): that digit is presumably the sampling month - confirm.
# Any other code has no match and becomes NA in the factor.
byfrac_df$season <- factor(
  c("Spring", "Summer", "Fall")[match(substr(byfrac_df$norep_water_name, 5, 5), c("5", "7", "9"))],
  levels = c("Spring", "Summer", "Fall")
)

# Regression of particle vs. free cell counts (log10 scale), excluding sample MOTE515
summary(lm(log10(part_bacabund) ~ log10(free_bacabund),
           data = dplyr::filter(byfrac_df, norep_water_name != "MOTE515")))

# Panel A: cell counts, particle vs. free (same exclusion as the model above).
# geom_point() has no `width` parameter (that belongs to geom_jitter), so the
# previously supplied `width = 0.2` was ignored and has been dropped from all three panels.
plot1 <- ggplot(dplyr::filter(byfrac_df, norep_water_name != "MOTE515"),
       aes(x = log10(free_bacabund), y = log10(part_bacabund))) +
  xlab("Free") +  ylab("Particle") +
  ggtitle(expression(atop(log[10]*"(Bacterial Counts)", "(cells/mL)"))) +
  #ggtitle("Log10(Bacterial Counts) \n (cells/mL)") +
  geom_point(size = 3, fill = "grey", aes(shape = season)) +
  scale_shape_manual(values = season_shapes) +
  theme(legend.position = "bottom",
        legend.title = element_blank())


################ Community-Wide production correlation between particle and free
## 1. Run the linear regression
lm_prod_corr <- lm(part_prod ~ free_prod, data = byfrac_df)

# 2. Extract the adjusted R2 and the slope p-value as a plotmath label
lm_lab_perliter_PAvsFL <- paste("atop(R^2 ==", round(summary(lm_prod_corr)$adj.r.squared, digits = 2), ",",
            "p ==", round(unname(summary(lm_prod_corr)$coefficients[,4][2]), digits = 3), ")")

## 3. Panel B: community production, particle vs. free
plot2 <- ggplot(byfrac_df, aes(x = free_prod, y = part_prod)) +
  xlab("Free") +  ylab("Particle") +
  ggtitle(expression(atop("Community Production", "(μgC/L/day)"))) +
  #ggtitle("Community Production \n(μgC/L/day)") +
  geom_point(size = 3, fill = "grey", aes(shape = season)) +
  scale_shape_manual(values = season_shapes) +
  geom_smooth(method = "lm", color = "black")  +
  # Add the R2 and p-value to the plot
  annotate("text", x=20, y=30, label=lm_lab_perliter_PAvsFL, parse = TRUE, color = "black", size = 4) +
  theme(legend.position = "none")


################ Per-Capita production correlation between particle and free
## 1. Run the linear regression
lm_percell_corr <- lm(log10(part_percell_prod) ~ log10(free_percell_prod), data = byfrac_df)

# 2. Extract the adjusted R2 and the slope p-value as a plotmath label
lm_lab_percell_PAvsFL <- paste("atop(R^2 ==", round(summary(lm_percell_corr)$adj.r.squared, digits = 2), ",",
            "p ==", round(unname(summary(lm_percell_corr)$coefficients[,4][2]), digits = 3), ")")

## 3. Panel C: per-capita production, particle vs. free
plot3 <- ggplot(byfrac_df,
       aes(x = log10(free_percell_prod), y = log10(part_percell_prod))) +
  xlab("Free") +  ylab("Particle") +
  ggtitle(expression(atop(log[10]*"(Per-Capita Production)", "(μgC/cell/day)"))) +
  #ggtitle("log10(Per-Capita Production)\n (μgC/cell/day)") +
  geom_point(size = 3, fill = "grey", aes(shape = season)) +
  scale_shape_manual(values = season_shapes) +
  geom_smooth(method = "lm", color = "black") +
  # Add the R2 and p-value to the plot
  annotate("text", y = -5.8, x=-7.9, label=lm_lab_percell_PAvsFL, parse = TRUE, color = "black", size = 4) +
  theme(legend.position = "none")


# Make the final multi-paneled plot
# Extract the (horizontal) season legend from panel A
legend_s1 <- get_legend(plot1 + theme(legend.direction = "horizontal",legend.justification="center" ,legend.box.just = "bottom"))

# Row of the 3 panels; panel A's own legend is hidden because it is drawn separately below
top_row_S1 <- plot_grid(plot1 +theme(legend.position = "none"), plot2, plot3,
          nrow = 1, ncol = 3,
          labels = c("A", "B", "C"),
          align = "h")

# Put the legend underneath the 3 panels
plot_grid(top_row_S1, legend_s1,
          rel_heights = c(1, 0.05),
          nrow = 2, ncol = 1)
```


## Figure S4
```{r Figure-S4, fig.width=9, fig.height=3.2}
# y position of the p-value label within each facet
maxes <- c(1350, 299, 78, 1.35, 150, 15.2, 5.8, 0.1)
# Label table: facet label, p-value text, x position (fraction) and y position
df_temp <- data.frame(hill_labels = site_div_labs, wc_pafl_pvals, fraction, maxes)

# Phylogenetic diversity metrics by fraction, one facet per metric
figS4 <- long_div_meta_df %>%
  dplyr::filter(
    Diversity %in% c("phylo_richness", "phylo_shannon", "phylo_simpson", "weighted_sesmpd")
  ) %>%
  ggplot(aes(x = fraction, y = div_val, fill = fraction)) +
  labs(y = "Diversity Value By Fraction") +
  facet_wrap(hill_labels ~ ., nrow = 2, ncol = 4, scales = "free", labeller = label_parsed) +
  geom_point(aes(shape = season), size = 3, position = position_jitterdodge()) +
  geom_boxplot(color = "black", alpha = 0.5, outlier.shape = NA) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  scale_color_manual(values = fraction_colors) +
  theme(axis.title.x = element_blank(), legend.position = "none") +
  # Only the label rows that belong to the four facets drawn here
  geom_text(
    data = dplyr::filter(
      df_temp,
      hill_labels %in% c("{}^0*italic(PD)", "{}^1*italic(PD)", "{}^2*italic(PD)", "italic(Weighted_MPD)")
    ),
    mapping = aes(x = fraction, y = maxes, label = wc_pafl_pvals),
    color = "grey40", size = 4.5, hjust = 0.5
  )

# Facets on top, season legend underneath
plot_grid(figS4, season_legend, nrow = 2, ncol = 1, rel_heights = c(1, 0.05))
```


## Figure S5
```{r prep1-Figure-S5, fig.width=6, fig.height=8}
#### Time series of PA vs FL diversity

# Question: Is there a temporal effect on diversity differences between particle and Free?

# plotmath facet labels, listed in the same order as div_order
italics_labeller <- c("{}^0*italic(D)", "{}^0*italic(PD)",
                      "{}^1*italic(D)", "{}^1*italic(PD)",
                      "{}^2*italic(D)","{}^2*italic(PD)")
div_order <- c("richness", "phylo_richness",
               "shannon", "phylo_shannon",
               "simpson", "phylo_simpson")

# The three tables below used to be built from three copies of the same
# pipeline; each is now derived from the previous one so the shared steps
# exist only once. The resulting tables are unchanged.

##### Base table, also used for the stats
# Long format -> one row per sample/metric with the Particle and Free values
# side by side, plus their difference (Particle minus Free)
wc_div_df <- wide_div_meta_df %>%
  dplyr::select(c(norep_filter_name:phylo_richness, lakesite, fraction, season)) %>%
  gather(key = "Diversity", value = div_val, richness:phylo_richness) %>%
  # Sample name without its 5th character, so both fractions share one key
  mutate(norep_name = paste(substr(norep_filter_name, 1, 4), substr(norep_filter_name, 6, 8), sep = "")) %>%
  dplyr::select(-norep_filter_name)  %>%
  tidyr::spread(key = fraction, value = div_val) %>%
  mutate(frac_difference = Particle-Free)

##### Plotting table: base table plus ordered metric and parsed facet label
timeseries_df <- wc_div_df %>%
  mutate(Diversity = fct_relevel(Diversity, levels = div_order),
         hill_labels = factor(Diversity, labels = italics_labeller))

##### Mean and standard deviation of the difference per metric and season
mean_div_df <- timeseries_df %>%
  group_by(Diversity, hill_labels, season) %>%
  summarize(mean_diff = mean(frac_difference), sd_diff = sd(frac_difference))

# All six metrics in one figure: Particle minus Free difference per season.
# Orange background = above zero (Particle higher), blue = below zero (Free higher).
timeseries_df %>%
  ggplot(aes(x = season, y = frac_difference)) +
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = 0, ymax = Inf,
           fill = "#FF6600", alpha = 0.2) +
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = 0,
           fill = "skyblue", alpha = 0.2) +
  facet_wrap(hill_labels ~ ., nrow = 3, ncol = 2, scales = "free", labeller = label_parsed) +
  geom_hline(yintercept = 0, color = "grey") +
  geom_jitter(aes(shape = lakesite), size = 3, width = 0.1, fill = "grey") +
  # Seasonal means drawn as wide horizontal bars (shape 95) joined by a line
  geom_point(data = mean_div_df, aes(x = season, y = mean_diff), shape = 95, size = 20) +
  geom_line(data = mean_div_df, aes(x = season, y = mean_diff, group = Diversity),
            color = "black", size = 1.5, alpha = 0.5) +
  scale_shape_manual(values = lakesite_shapes) +
  theme(
    legend.position = "bottom",
    axis.title = element_blank(), # legend.title = element_blank(),
    strip.text = element_text(face = "bold"),
    strip.background = element_rect(colour = "grey", fill = "white")
  )
```

```{r prep2-Figure-S5, fig.width=3.5, fig.height=8}
# Seasonal means split per metric, used for the mean bars and lines in panels A-C
mean_div_df_rich <- mean_div_df %>% dplyr::filter(Diversity == "richness")
mean_div_df_shan <- mean_div_df %>% dplyr::filter(Diversity == "shannon")
mean_div_df_simps <- mean_div_df %>% dplyr::filter(Diversity == "simpson")

# Significance between seasons is assessed with pairwise Wilcoxon tests:
# does the Particle minus Free difference change between two seasons?
# Richness
wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "richness", season %in% c("Spring", "Summer"))
) # NS
wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "richness", season %in% c("Summer", "Fall"))
) # Marginal, p = 0.06

wx_0D_SpFa <- wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "richness", season %in% c("Spring", "Fall"))
) # Significant!

# Corner points of the bracket drawn between the first and third season positions
dat_0D_SpFa <- data.frame(
  a = c(1.15, 1.15, 2.85, 2.85),
  b = c(730, 750, 750, 730)
)

# Panel A: richness difference per season with the Spring-vs-Fall bracket and p-value
timeseries_0D <- timeseries_df %>%
  dplyr::filter(Diversity == "richness") %>%
  ggplot(aes(x = season, y = frac_difference)) +
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = 0, ymax = Inf,
           fill = "#FF6600", alpha = 0.3) +
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = 0,
           fill = "skyblue", alpha = 0.3) +
  geom_hline(yintercept = 0, color = "grey") +
  geom_jitter(aes(shape = lakesite), size = 3, width = 0.1, fill = "grey") +
  geom_point(data = mean_div_df_rich, aes(x = season, y = mean_diff), shape = 95, size = 20) +
  geom_line(data = mean_div_df_rich, aes(x = season, y = mean_diff, group = Diversity),
            color = "black", size = 1.5, alpha = 0.5) +
  labs(y = expression("Particle"~{}^0*italic(D)~"– Free"~{}^0*italic(D))) +
  scale_shape_manual(values = lakesite_shapes) +
  geom_path(data = dat_0D_SpFa, aes(x = a, y = b), color = "#424645", linetype = 1) +
  annotate("text", x = 2, y = 790, label = "*", angle = 90, size = 8, color = "#424645") +
  annotate("text", x = 2, y = 700, size = 4, color = "#424645",
           label = paste("p =", round(wx_0D_SpFa$p.value, digits = 2))) +
  theme(
    legend.position = "none",
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    # margins are given as top, right, bottom, left
    plot.margin = margin(0, 0, 0, 0.6, "cm")
  )

# Shannon: pairwise Wilcoxon tests of the Particle minus Free difference between seasons
wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "shannon", season %in% c("Spring", "Summer"))
) # NS

wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "shannon", season %in% c("Summer", "Fall"))
) # NS

wx_1D_SpFa <- wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "shannon", season %in% c("Spring", "Fall"))
) # Significant!
# Corner points of the bracket drawn between the first and third season positions
dat_1D_SpFa <- data.frame(
  a = c(1.15, 1.15, 2.85, 2.85),
  b = c(280, 290, 290, 280)
)

# Panel B: Shannon difference per season with the Spring-vs-Fall bracket and p-value
timeseries_1D <- timeseries_df %>%
  dplyr::filter(Diversity == "shannon") %>%
  ggplot(aes(x = season, y = frac_difference)) +
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = 0, ymax = Inf,
           fill = "#FF6600", alpha = 0.3) +
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = 0,
           fill = "skyblue", alpha = 0.3) +
  geom_hline(yintercept = 0, color = "grey") +
  geom_jitter(aes(shape = lakesite), size = 3, width = 0.1, fill = "grey") +
  geom_point(data = mean_div_df_shan, aes(x = season, y = mean_diff), shape = 95, size = 20) +
  geom_line(data = mean_div_df_shan, aes(x = season, y = mean_diff, group = Diversity),
            color = "black", size = 1.5, alpha = 0.5) +
  labs(y = expression("Particle"~{}^1*italic(D)~"– Free"~{}^1*italic(D))) +
  scale_shape_manual(values = lakesite_shapes) +
  geom_path(data = dat_1D_SpFa, aes(x = a, y = b), color = "#424645", linetype = 1) +
  annotate("text", x = 2, y = 310, label = "*", angle = 90, size = 8, color = "#424645") +
  annotate("text", x = 2, y = 270, size = 4, color = "#424645",
           label = paste("p =", round(wx_1D_SpFa$p.value, digits = 2))) +
  theme(
    legend.position = "none",
    axis.text.x = element_blank(),
    axis.title.x = element_blank(),
    # margins are given as top, right, bottom, left
    plot.margin = margin(0, 0, 0, 0.6, "cm")
  )

# Simpson: pairwise Wilcoxon tests of the Particle minus Free difference between seasons
wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "simpson", season %in% c("Spring", "Summer"))
) # NS

wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "simpson", season %in% c("Summer", "Fall"))
) # NS

wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "simpson", season %in% c("Spring", "Fall"))
) # Marginal significance = 0.06

# Panel C: Simpson difference per season. No bracket is drawn here, and this
# panel keeps its x axis text and the legend (harvested for the combined figure).
timeseries_2D <- timeseries_df %>%
  dplyr::filter(Diversity == "simpson") %>%
  ggplot(aes(x = season, y = frac_difference)) +
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = 0, ymax = Inf,
           fill = "#FF6600", alpha = 0.3) +
  annotate("rect", xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = 0,
           fill = "skyblue", alpha = 0.3) +
  geom_hline(yintercept = 0, color = "grey") +
  geom_jitter(aes(shape = lakesite), size = 3, width = 0.1, fill = "grey") +
  geom_point(data = mean_div_df_simps, aes(x = season, y = mean_diff), shape = 95, size = 20) +
  geom_line(data = mean_div_df_simps, aes(x = season, y = mean_diff, group = Diversity),
            color = "black", size = 1.5, alpha = 0.5) +
  labs(y = expression("Particle"~{}^2*italic(D)~"– Free"~{}^2*italic(D))) +
  scale_shape_manual(values = lakesite_shapes) +
  theme(
    legend.title = element_blank(),
    legend.position = "bottom",
    axis.title.x = element_blank(),
    # margins are given as top, right, bottom, left
    plot.margin = margin(0, 0, 0, 0.6, "cm")
  )

# COMBINE PANELS A-C
# Take the horizontal station legend from the Simpson panel
ts_legend <- get_legend(
  timeseries_2D +
    theme(
      legend.direction = "horizontal",
      legend.justification = "center",
      legend.box.just = "bottom"
    )
)

# Stack the three panels; the Simpson panel's own legend is hidden here
# because the extracted copy is placed underneath the stack
ts_p1 <- plot_grid(
  timeseries_0D,
  timeseries_1D,
  timeseries_2D + theme(legend.position = "none"),
  labels = c("A", "B", "C"),
  nrow = 3, ncol = 1,
  align = "hv"
)

# Panel column on top, legend strip below; then display the result
ts_pp <- plot_grid(ts_p1, ts_legend,
                   nrow = 2, ncol = 1,
                   rel_heights = c(1, 0.05))
ts_pp

# Same pairwise seasonal Wilcoxon tests for the phylogenetic metrics
# (results are printed only; trailing notes are the recorded outcomes)
# Phylogenetic Richness
wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "phylo_richness", season %in% c("Spring", "Summer"))
) # NS
wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "phylo_richness", season %in% c("Summer", "Fall"))
) # Significant!
wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "phylo_richness", season %in% c("Spring", "Fall"))
) # Marginal significance = 0.06

# Phylogenetic Shannon
wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "phylo_shannon", season %in% c("Spring", "Summer"))
) # NS
wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "phylo_shannon", season %in% c("Summer", "Fall"))
) # NS
wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "phylo_shannon", season %in% c("Spring", "Fall"))
) # Significant!

# Phylogenetic Simpson
wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "phylo_simpson", season %in% c("Spring", "Summer"))
) # NS
wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "phylo_simpson", season %in% c("Summer", "Fall"))
) # NS
wilcox.test(
  frac_difference ~ season,
  data = dplyr::filter(wc_div_df, Diversity == "phylo_simpson", season %in% c("Spring", "Fall"))
) # NS
```

```{r prep3-Figure-S5, fig.width=3.5, fig.height=4.2}
#################################################
######### PLOT THE HILL NUMBERS BY Q ############
#################################################
# Hill order q for each metric, listed in the same order as div_orderr
hill_nums <-  c(0,0,1,1,2,2)
div_orderr <- c("richness", "phylo_richness",
                "shannon", "phylo_shannon",
                "simpson", "phylo_simpson")

# One line per sample, joining its q = 0, 1 and 2 Hill numbers
# (only the non-phylogenetic metrics are plotted)
hill_nums_p <- wide_div_meta_df %>%
  dplyr::select(c(norep_filter_name:phylo_richness, lakesite, fraction, season)) %>%
  gather(key = Diversity, value = div_val, richness:phylo_richness) %>%
  mutate(Diversity = fct_relevel(Diversity, levels = div_orderr),
         # Here `hill_nums` on the right-hand side is the vector defined above;
         # the duplicated labels collapse each metric pair onto one q level
         hill_nums = factor(Diversity, labels = hill_nums),
         # Water-sample key: characters 1-4 (previously 1-3, which dropped the
         # 4th character and disagreed with the other sample-name constructions
         # in this file) plus characters 6-8. The column is not mapped to any
         # aesthetic below, so the figure itself is unaffected.
         norep_water_name = paste0(substr(norep_filter_name, 1, 4), substr(norep_filter_name, 6, 8))) %>%
  dplyr::filter(Diversity %in% c("richness", "shannon", "simpson")) %>%
  ggplot(aes(x = hill_nums, y = div_val, fill = fraction, shape = season, group = norep_filter_name)) +
  geom_point(size = 3, alpha = 0.8) +
  labs(x = "q", y = "Hill Number") +
  #facet_grid(.~fraction)+
  geom_line(size = 1.5, aes(color = fraction), alpha = 0.5) +
  scale_y_continuous(limits = c(0, 1500), breaks = seq(0, 1500, by = 250), expand = c(0,0)) +
  scale_fill_manual(values = fraction_colors) +
  scale_shape_manual(values = season_shapes) +
  scale_color_manual(values = fraction_colors) +
  theme(legend.position = "none")


## Add the two-row season legend underneath, then display the result
hill_nums_p2 <- plot_grid(hill_nums_p, season_2row_legend, nrow = 2, ncol = 1, rel_heights = c(1, 0.1))
hill_nums_p2
```

```{r Figure-S5, fig.width=7, fig.height=8}
# Second column of Figure S5: panel D, padded with empty cells above and below
tessst <- plot_grid(
  NULL, hill_nums_p2, NULL,
  labels = c("", "D", ""),
  rel_heights = c(0.74, 1, 0.6),
  nrow = 3, ncol = 1
)

# Figure S5: first column (panels A-C plus legend) beside the second column (panel D)
plot_grid(ts_pp, tessst, ncol = 2)
```

## Figure S6
**Compare Diversity Estimates across Station and Season**
```{r Figure-S6, fig.width=12, fig.height=10}
########## PANEL A: diversity values by lake station, one facet per hill_labels value
divs_PAFLA_plot_lakesite <- long_div_meta_df %>%
  ggplot(aes(x = lakesite, y = div_val, shape = lakesite)) +
  # Jitter-dodged raw points first, then semi-transparent boxplots over them
  geom_point(
    aes(fill = fraction),
    position = position_jitterdodge(),
    color = "black", size = 3
  ) +
  geom_boxplot(aes(fill = fraction), color = "black", alpha = 0.5, outlier.shape = NA) +
  facet_wrap(hill_labels~., labeller = label_parsed, scales = "free", nrow = 2, ncol = 4) +
  ylab("Mean Diversity Value\n By Lake Station") +
  scale_shape_manual(values = lakesite_shapes) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1)
  )

# Stack the panel over the lake-station legend
plot_A <- plot_grid(
  divs_PAFLA_plot_lakesite, lakesite_legend,
  labels = c("A", ""),
  rel_heights = c(1, 0.05),
  nrow = 2, ncol = 1
)


# PANEL B: diversity values by season, one facet per hill_labels value
divs_PAFLA_plot_season <- long_div_meta_df %>%
  ggplot(aes(x = season, y = div_val, shape = season)) +
  # Jitter-dodged raw points first, then semi-transparent boxplots over them
  geom_point(
    aes(fill = fraction),
    position = position_jitterdodge(),
    color = "black", size = 3
  ) +
  geom_boxplot(aes(fill = fraction), color = "black", alpha = 0.5, outlier.shape = NA) +
  facet_wrap(hill_labels~., labeller = label_parsed, scales = "free", nrow = 2, ncol = 4) +
  ylab("Mean Diversity Value \n By Season") +
  scale_shape_manual(values = season_shapes) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1)
  )

# Stack the panel over the season legend
plot_B <- plot_grid(
  divs_PAFLA_plot_season, season_legend,
  labels = c("B", ""),
  rel_heights = c(1, 0.05),
  nrow = 2, ncol = 1
)

###### PLOT FIGURE S6: panel A above panel B
plot_grid(plot_A, plot_B, nrow = 2, ncol = 1)
```

## Figure S7
```{r Figure-S7, fig.width=4, fig.height=7}

# P-value (4th column of the coefficient table) of the second coefficient of
# each Shannon model, FDR-adjusted together.
figS7_pvals <- c(
  summary(lm_prod_vs_shannon_PA)$coefficients[, 4][2],
  summary(lm_percell_prod_vs_shannon_PA)$coefficients[, 4][2]
)
figS7_adjusted_pvals <- p.adjust(figS7_pvals, method = "fdr")

######################################################### 1D = SHANNON DIVERSITY (EXP(SHANNON ENTROPY))
# wilcox.test of shannon between the fraction groups; printed for the report
shannon_fraction_wilcox <- wilcox.test(shannon ~ fraction, data = wide_div_meta_df)
shannon_fraction_wilcox

# Mean, sd, min and max of shannon within each fraction
wide_div_meta_df %>%
  group_by(fraction) %>%
  summarize(mean(shannon), sd(shannon), min(shannon), max(shannon))

# Corner coordinates of the significance bracket drawn on the boxplot
# (positions were chosen by eye)
shannon_nums <- data.frame(
  a = c(1.15, 1.15, 1.85, 1.85),
  b = c(300, 305, 305, 300)
) # WholePart vs WholeFree

# Flipped boxplot of shannon by fraction with the raw points, a bracket,
# hard-coded "**" stars and the Wilcoxon p-value printed next to it.
shannon_distribution_plot <- wide_div_meta_df %>%
  ggplot(aes(x = fraction, y = shannon)) +
  geom_jitter(aes(fill = fraction, shape = season), width = 0.2, size = 3) +
  geom_boxplot(aes(fill = fraction), outlier.shape = NA, alpha = 0.5) +
  # Add line and pval
  geom_path(data = shannon_nums, aes(x = a, y = b), color = "#424645", linetype = 1) +
  annotate("text", x = 1.5, y = 325, label = "**", angle = 90, size = 8, color = "#424645") +
  annotate(
    "text", x = 1.5, y = 250, size = 4, color = "#424645",
    label = paste("p =", round(shannon_fraction_wilcox$p.value, digits = 3))
  ) +
  scale_y_continuous(breaks = seq(from = 0, to = 300, by = 50), limits = c(0, 325)) +
  scale_shape_manual(values = season_shapes) +
  scale_fill_manual(values = fraction_colors) +
  labs(x = "Fraction", y = expression({}^1*italic(D))) +
  theme(
    legend.position = "none", # axis.title.y = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank()
  ) +
  coord_flip()

################ Shannon vs Community-wide (Per-Liter) Production
## 1. Build the plotmath label from the model's adjusted R2 and its FDR-adjusted p-value:
lm_lab_perliter_shannon_PA <- paste(
  "atop(R^2 ==", round(summary(lm_prod_vs_shannon_PA)$adj.r.squared, digits = 2), ",",
  "p ==", round(unname(figS7_adjusted_pvals[1]), digits = 3), ")"
)

## 2. Plot Shannon vs Community-wide (Per-Liter) Production
prod_vs_shannon_plot <- wide_div_meta_df %>%
  # Join the "s.e." column of the Shannon rows of div_iNEXT for the x-axis error bars
  left_join(filter(div_iNEXT, Diversity == "Shannon diversity"), by = "norep_filter_name") %>%
  rename(se_iNEXT = "s.e.") %>%
  ggplot(aes(x = shannon, y = frac_bacprod)) +
  # X-axis errorbars
  geom_errorbarh(
    aes(xmin = shannon - se_iNEXT, xmax = shannon + se_iNEXT, color = fraction),
    alpha = 0.7
  ) +
  # Y-axis errorbars
  geom_errorbar(
    aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)
  ) +
  geom_point(aes(fill = fraction, shape = season), color = "black", size = 3.5) +
  # Linear fit drawn for the "Particle" rows only
  geom_smooth(
    data = filter(wide_div_meta_df, fraction == "Particle"),
    method = "lm", color = "#FF6600", fill = "#FF6600"
  ) +
  # Add the R2 and p-value to the plot
  annotate("text", x = 250, y = 45, label = lm_lab_perliter_shannon_PA, parse = TRUE, size = 4, color = "#FF6600") +
  scale_x_continuous(breaks = seq(from = 0, to = 300, by = 50), limits = c(0, 325)) +
  scale_y_continuous(breaks = seq(0, 75, by = 15), limits = c(-5, 75)) +
  scale_shape_manual(values = season_shapes) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  labs(
    x = expression({}^1*italic(D)),
    y = expression(atop("Community Production", "(μgC/L/day)"))
  ) +
  theme(legend.position = "none")


################ Shannon vs Per-Capita (Per-Cell) Production
## 1. Build the plotmath label from the model's adjusted R2 and its FDR-adjusted p-value:
lm_lab_percell_shannon_PA <- paste(
  "atop(R^2 ==", round(summary(lm_percell_prod_vs_shannon_PA)$adj.r.squared, digits = 2), ",",
  "p ==", round(unname(figS7_adjusted_pvals[2]), digits = 3), ")"
)

## 2. Plot Shannon vs Per-Capita (Per-Cell) Production
percell_prod_vs_shannon_plot <- wide_div_meta_df %>%
  # Join the "s.e." column of the Shannon rows of div_iNEXT for the x-axis error bars
  left_join(filter(div_iNEXT, Diversity == "Shannon diversity"), by = "norep_filter_name") %>%
  rename(se_iNEXT = "s.e.") %>%
  ggplot(aes(x = shannon, y = log10(fracprod_per_cell_noinf))) +
  geom_errorbarh(
    aes(xmin = shannon - se_iNEXT, xmax = shannon + se_iNEXT, color = fraction),
    alpha = 0.7
  ) +
  geom_point(aes(fill = fraction, shape = season), color = "black", size = 3.5) +
  # Linear fit drawn for the "Particle" rows only
  geom_smooth(
    data = filter(wide_div_meta_df, fraction == "Particle"),
    method = "lm", color = "#FF6600", fill = "#FF6600"
  ) +
  # Add the R2 and p-value to the plot
  annotate("text", x = 250, y = -7.5, label = lm_lab_percell_shannon_PA, parse = TRUE, size = 4, color = "#FF6600") +
  scale_x_continuous(breaks = seq(from = 0, to = 300, by = 50), limits = c(0, 325)) +
  scale_y_continuous(breaks = seq(from = -8, to = -5, by = 1), limits = c(-8.5, -5)) +
  scale_shape_manual(values = season_shapes) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  labs(
    x = expression({}^1*italic(D)),
    y = expression(atop(log[10]*"(Per-Capita Production)", "(μgC/cell/day)"))
  ) +
  theme(legend.position = "none", legend.title = element_blank())


# Two Shannon-vs-production panels stacked vertically (A on top of B);
# the top panel gets a wider left margin.
shannon_plots <- plot_grid(
  prod_vs_shannon_plot + theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.8), "cm")),
  percell_prod_vs_shannon_plot + theme(legend.position = "none"),
  labels = c("A", "B"),
  rel_heights = c(1, 1),
  align = "v",
  nrow = 2, ncol = 1
)

######## PLOT FIGURE S7 (panels above the two-row season legend)
# NOTE(review): the earlier comment said "FIGURE S6"; the chunk is labelled Figure-S7.
plot_grid(
  shannon_plots, season_2row_legend,
  rel_heights = c(1, 0.08),
  nrow = 2, ncol = 1
)
```

## Figure S8
**PD and Productivity**
```{r pre-Figure-S8}
# FDR correction across the six phylogenetic-diversity models.
# Order: phylorich (per-liter, per-cell), phyloshan (per-liter, per-cell),
# phylosimps (per-liter, per-cell); later code indexes the result by position.
figS8_pvals <- c(
  summary(lm_prod_vs_phylorich_PA)$coefficients[, 4][2],
  summary(lm_percell_prod_vs_phylorich_PA)$coefficients[, 4][2],
  summary(lm_prod_vs_phyloshan_PA)$coefficients[, 4][2],
  summary(lm_percell_prod_vs_phyloshan_PA)$coefficients[, 4][2],
  summary(lm_prod_vs_phylosimps_PA)$coefficients[, 4][2],
  summary(lm_percell_prod_vs_phylosimps_PA)$coefficients[, 4][2]
)
figS8_adjusted_pvals <- p.adjust(figS8_pvals, method = "fdr")

######################################################### PHYLOGENETIC RICHNESS #########################################################
# Axis limits and breaks shared by every phylo_richness axis below
phylorich_limits <- c(25, 175)
phylorich_breaks <- seq(from = 25, to = 175, by = 50)

#### Are particles more phylogenetically rich compared to free-living?
# wilcox.test of phylo_richness between the fraction groups; printed for the report
phylorich_fraction_wilcox <- wilcox.test(phylo_richness ~ fraction, data = wide_div_meta_df)
phylorich_fraction_wilcox

# Mean, sd, min and max of phylo_richness within each fraction
wide_div_meta_df %>%
  group_by(fraction) %>%
  summarize(mean(phylo_richness), sd(phylo_richness), min(phylo_richness), max(phylo_richness))

# Corner coordinates of the significance bracket drawn on the boxplot
# (positions were chosen by eye)
phylorich_nums <- data.frame(
  a = c(1.15, 1.15, 1.85, 1.85),
  b = c(155, 160, 160, 155)
) # WholePart vs WholeFree

# Flipped boxplot of phylo_richness by fraction with the raw points, a bracket,
# hard-coded "**" stars and the Wilcoxon p-value.
phylorich_distribution_plot <- wide_div_meta_df %>%
  ggplot(aes(x = fraction, y = phylo_richness)) +
  geom_jitter(aes(shape = season, fill = fraction), width = 0.2, size = 3) +
  geom_boxplot(aes(fill = fraction), outlier.shape = NA, alpha = 0.5) +
  geom_path(data = phylorich_nums, aes(x = a, y = b), color = "#424645", linetype = 1) +
  annotate("text", x = 1.5, y = 170, label = "**", angle = 90, size = 8, color = "#424645") +
  annotate(
    "text", x = 1.5, y = 135, size = 4, color = "#424645",
    label = paste("p =", round(phylorich_fraction_wilcox$p.value, digits = 3))
  ) +
  scale_y_continuous(breaks = phylorich_breaks, limits = phylorich_limits, expand = c(0, 0)) +
  scale_shape_manual(values = season_shapes) +
  scale_fill_manual(values = fraction_colors) +
  labs(x = "Fraction", y = expression({}^0*italic(PD))) +
  theme(
    legend.position = "none", # axis.title.y = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank()
  ) +
  coord_flip()

################ Phylogenetic Richness vs Community-wide (Per-Liter) Production
## 1. Build the plotmath label from the model's adjusted R2 and its FDR-adjusted p-value:
lm_lab_perliter_phylorich_PA <- paste(
  "atop(R^2 ==", round(summary(lm_prod_vs_phylorich_PA)$adj.r.squared, digits = 2), ",",
  "p ==", round(unname(figS8_adjusted_pvals[1]), digits = 2), ")"
)

## 2. Plot Phylogenetic Richness vs Community-wide (Per-Liter) Production
prod_vs_phylorich_plot <- wide_div_meta_df %>%
  ggplot(aes(x = phylo_richness, y = frac_bacprod)) +
  # Y-axis errorbars
  geom_errorbar(
    aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)
  ) +
  geom_point(aes(fill = fraction, shape = season), color = "black", size = 3.5) +
  # Linear fit drawn for the "Particle" rows only
  geom_smooth(
    data = filter(wide_div_meta_df, fraction == "Particle"),
    method = "lm", color = "#FF6600", fill = "#FF6600"
  ) +
  # Add the R2 and p-value to the plot
  annotate("text", x = 130, y = 50, label = lm_lab_perliter_phylorich_PA, parse = TRUE, size = 4, color = "#FF6600") +
  scale_x_continuous(breaks = phylorich_breaks, limits = phylorich_limits, expand = c(0, 0)) +
  scale_y_continuous(breaks = seq(0, 75, by = 15), limits = c(-5, 75)) +
  scale_shape_manual(values = season_shapes) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  labs(
    x = expression({}^0*italic(PD)),
    y = expression(atop("Community Production", "(μgC/L/day)"))
  ) +
  theme(legend.position = "none")

# Summary stats
summary(lm(
  frac_bacprod ~ phylo_richness,
  data = filter(wide_div_meta_df, fraction == "Particle")
))
# WITHOUT THE TWO HIGH POINTS
# NOTE(review): the exclusion is on `richness < 1000`, not on phylo_richness
# (the per-cell counterpart filters on phylo_richness) -- confirm this is intended.
summary(lm(
  frac_bacprod ~ phylo_richness,
  data = filter(wide_div_meta_df, fraction == "Particle" & richness < 1000)
))


################ Phylogenetic Richness vs Per Capita (Per-Cell) Production
## 1. Build the plotmath label from the model's adjusted R2 and its FDR-adjusted p-value:
lm_lab_percell_phylorich_PA <- paste(
  "atop(R^2 ==", round(summary(lm_percell_prod_vs_phylorich_PA)$adj.r.squared, digits = 2), ",",
  "p ==", round(unname(figS8_adjusted_pvals[2]), digits = 2), ")"
)

## 2. Plot Phylogenetic Richness vs Per Capita (Per-Cell) Production
percell_prod_vs_phylorich_plot <- wide_div_meta_df %>%
  ggplot(aes(x = phylo_richness, y = log10(fracprod_per_cell_noinf))) +
  geom_point(aes(fill = fraction, shape = season), color = "black", size = 3.5) +
  # Linear fit drawn for the "Particle" rows only
  geom_smooth(
    data = filter(wide_div_meta_df, fraction == "Particle"),
    method = "lm", color = "#FF6600", fill = "#FF6600"
  ) +
  # Add the R2 and p-value to the plot
  annotate("text", x = 150, y = -7.5, label = lm_lab_percell_phylorich_PA, parse = TRUE, size = 4, color = "#FF6600") +
  scale_x_continuous(breaks = phylorich_breaks, limits = phylorich_limits, expand = c(0, 0)) +
  scale_y_continuous(breaks = seq(from = -8, to = -5, by = 1), limits = c(-8.5, -5)) +
  scale_shape_manual(values = season_shapes) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  labs(
    x = expression({}^0*italic(PD)),
    y = expression(atop(log[10]*"(Per-Capita Production)", "(μgC/cell/day)"))
  ) +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.text = element_text(size = 14)
  )

# Summary stats
summary(lm(
  log10(fracprod_per_cell_noinf) ~ phylo_richness,
  data = filter(wide_div_meta_df, fraction == "Particle")
))
# WITHOUT THE TWO HIGH POINTS
summary(lm(
  log10(fracprod_per_cell_noinf) ~ phylo_richness,
  data = filter(wide_div_meta_df, fraction == "Particle" & phylo_richness < 130)
))


######################################################### 1D = PHYLOGENETIC SHANNON DIVERSITY (EXP(SHANNON ENTROPY))
# Axis limits and breaks shared by every phylo_shannon axis below
phyloshan_limits <- c(0, 20)
phyloshan_breaks <- seq(from = 0, to = 20, by = 5)

# wilcox.test of phylo_shannon between the fraction groups; printed for the report
phyloshannon_fraction_wilcox <- wilcox.test(phylo_shannon ~ fraction, data = wide_div_meta_df)
phyloshannon_fraction_wilcox

# Mean, sd, min and max of phylo_shannon within each fraction
wide_div_meta_df %>%
  group_by(fraction) %>%
  summarize(mean(phylo_shannon), sd(phylo_shannon), min(phylo_shannon), max(phylo_shannon))

# Corner coordinates of the significance bracket drawn on the boxplot
# (positions were chosen by eye)
phyloshannon_nums <- data.frame(
  a = c(1.15, 1.15, 1.85, 1.85),
  b = c(18.25, 18.5, 18.5, 18.25)
) # WholePart vs WholeFree

# Flipped boxplot of phylo_shannon by fraction with the raw points, a bracket,
# hard-coded "**" stars and the Wilcoxon p-value.
phyloshan_distribution_plot <- wide_div_meta_df %>%
  ggplot(aes(x = fraction, y = phylo_shannon)) +
  geom_jitter(aes(fill = fraction, shape = season), width = 0.2, size = 3) +
  geom_boxplot(aes(fill = fraction), outlier.shape = NA, alpha = 0.5) +
  # Add line and pval
  geom_path(data = phyloshannon_nums, aes(x = a, y = b), color = "#424645", linetype = 1) +
  annotate("text", x = 1.5, y = 20, label = "**", angle = 90, size = 8, color = "#424645") +
  annotate(
    "text", x = 1.5, y = 15.5, size = 4, color = "#424645",
    label = paste("p =", round(phyloshannon_fraction_wilcox$p.value, digits = 3))
  ) +
  scale_y_continuous(breaks = phyloshan_breaks, limits = phyloshan_limits, expand = c(0, 0)) +
  scale_shape_manual(values = season_shapes) +
  scale_fill_manual(values = fraction_colors) +
  labs(x = "Fraction", y = expression({}^1*italic(PD))) +
  theme(
    legend.position = "none", # axis.title.y = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank()
  ) +
  coord_flip()

################ Phylogenetic Shannon vs Community-wide (Per-Liter) Production
## 1. Build the plotmath label from the model's adjusted R2 and its FDR-adjusted p-value:
lm_lab_perliter_phyloshan_PA <- paste(
  "atop(R^2 ==", round(summary(lm_prod_vs_phyloshan_PA)$adj.r.squared, digits = 2), ",",
  "p ==", round(unname(figS8_adjusted_pvals[3]), digits = 2), ")"
)

## 2. Plot Phylogenetic Shannon vs Community-wide (Per-Liter) Production
prod_vs_phyloshan_plot <- wide_div_meta_df %>%
  ggplot(aes(x = phylo_shannon, y = frac_bacprod)) +
  # Y-axis errorbars
  geom_errorbar(
    aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)
  ) +
  geom_point(aes(fill = fraction, shape = season), color = "black", size = 3.5) +
  # Dotted linear fit without a confidence ribbon fill, "Particle" rows only
  geom_smooth(
    data = filter(wide_div_meta_df, fraction == "Particle"),
    method = "lm", linetype = "dotted", color = "#FF6600", fill = NA
  ) +
  # Add the R2 and p-value to the plot
  annotate("text", x = 15, y = 50, label = lm_lab_perliter_phyloshan_PA, parse = TRUE, size = 4, color = "#FF6600") +
  scale_x_continuous(breaks = phyloshan_breaks, limits = phyloshan_limits, expand = c(0, 0)) +
  scale_y_continuous(breaks = seq(0, 75, by = 15), limits = c(-5, 75)) +
  scale_shape_manual(values = season_shapes) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  labs(
    x = expression({}^1*italic(PD)),
    y = expression(atop("Community Production", "(μgC/L/day)"))
  ) +
  theme(legend.position = "none")


################ Phylogenetic Shannon vs Per-Capita (Per-Cell) Production
## 1. Build the plotmath label from the model's adjusted R2 and its FDR-adjusted p-value:
lm_lab_percell_phyloshan_PA <- paste(
  "atop(R^2 ==", round(summary(lm_percell_prod_vs_phyloshan_PA)$adj.r.squared, digits = 2), ",",
  "p ==", round(unname(figS8_adjusted_pvals[4]), digits = 2), ")"
)

## 2. Plot Phylogenetic Shannon vs Per-Capita (Per-Cell) Production
percell_prod_vs_phyloshan_plot <- wide_div_meta_df %>%
  ggplot(aes(x = phylo_shannon, y = log10(fracprod_per_cell_noinf))) +
  geom_point(aes(fill = fraction, shape = season), color = "black", size = 3.5) +
  # Dotted linear fit without a confidence ribbon fill, "Particle" rows only
  geom_smooth(
    data = filter(wide_div_meta_df, fraction == "Particle"),
    method = "lm", linetype = "dotted", color = "#FF6600", fill = NA
  ) +
  # Add the R2 and p-value to the plot
  annotate("text", x = 15, y = -7.5, label = lm_lab_percell_phyloshan_PA, parse = TRUE, size = 4, color = "#FF6600") +
  scale_x_continuous(breaks = phyloshan_breaks, limits = phyloshan_limits, expand = c(0, 0)) +
  scale_y_continuous(breaks = seq(from = -8, to = -5, by = 1), limits = c(-8.5, -5)) +
  scale_shape_manual(values = season_shapes) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  labs(
    x = expression({}^1*italic(PD)),
    y = expression(atop(log[10]*"(Per-Capita Production)", "(μgC/cell/day)"))
  ) +
  theme(legend.position = "none", legend.title = element_blank())


# Single column with the three phylogenetic-Shannon panels:
# distribution on top, then per-liter and per-cell production.
phyloshan_plots <- plot_grid(
  phyloshan_distribution_plot,
  prod_vs_phyloshan_plot,
  percell_prod_vs_phyloshan_plot,
  #labels = c("B", "D", "F"),
  rel_heights = c(0.5, 0.8, 1.1),
  align = "v",
  nrow = 3, ncol = 1
)


######################################################### 2D = PHYLOGENETIC SIMPSON DIVERSITY (INVERSE SIMPSON)
# Axis limits and breaks shared by every phylo_simpson axis below
phylosimps_limits <- c(1, 7)
phylosimps_breaks <- seq(from = 1, to = 7, by = 1)

# wilcox.test of phylo_simpson between the fraction groups; printed for the report
phylosimps_fraction_wilcox <- wilcox.test(phylo_simpson ~ fraction, data = wide_div_meta_df)
phylosimps_fraction_wilcox

# Mean, sd, min and max of phylo_simpson within each fraction
wide_div_meta_df %>%
  group_by(fraction) %>%
  summarize(mean(phylo_simpson), sd(phylo_simpson), min(phylo_simpson), max(phylo_simpson))

# Corner coordinates of the bracket drawn on the boxplot
# (positions were chosen by eye)
phylosimps_nums <- data.frame(
  a = c(1.15, 1.15, 1.85, 1.85),
  b = c(6.1, 6.2, 6.2, 6.1)
) # WholePart vs WholeFree

# Flipped boxplot of phylo_simpson by fraction with the raw points and a
# bracket carrying a hard-coded "NS" label.
phylosimps_distribution_plot <- wide_div_meta_df %>%
  ggplot(aes(x = fraction, y = phylo_simpson)) +
  geom_jitter(aes(shape = season, fill = fraction), width = 0.2, size = 3) +
  geom_boxplot(aes(fill = fraction), outlier.shape = NA, alpha = 0.5) +
  geom_path(data = phylosimps_nums, aes(x = a, y = b), color = "#424645", linetype = 1) +
  annotate("text", x = 1.5, y = 6.5, label = "NS", size = 4, color = "#424645") +
  scale_y_continuous(breaks = phylosimps_breaks, limits = phylosimps_limits, expand = c(0, 0)) +
  scale_shape_manual(values = season_shapes) +
  scale_fill_manual(values = fraction_colors) +
  labs(x = "Fraction", y = expression({}^2*italic(PD))) +
  theme(
    legend.position = "none", #axis.title.y = element_blank(),
    axis.title.x = element_blank(),
    axis.text.x = element_blank()
  ) +
  coord_flip()


################ Phylogenetic Inverse Simpson vs Community-wide (Per-Liter) Production
## 1. Build the plotmath label from the model's adjusted R2 and its FDR-adjusted p-value
##    (the label is not added to the plot below):
lm_lab_perliter_phylosimps_PA <- paste(
  "atop(R^2 ==", round(summary(lm_prod_vs_phylosimps_PA)$adj.r.squared, digits = 2), ",",
  "p ==", round(unname(figS8_adjusted_pvals[5]), digits = 2), ")"
)

## 2. Plot Phylogenetic Inverse Simpson vs Community-wide (Per-Liter) Production
prod_vs_phylosimps_plot <- wide_div_meta_df %>%
  ggplot(aes(x = phylo_simpson, y = frac_bacprod)) +
  # Y-axis errorbars
  geom_errorbar(
    aes(ymin = frac_bacprod - SD_frac_bacprod, ymax = frac_bacprod + SD_frac_bacprod, color = fraction)
  ) +
  geom_point(aes(fill = fraction, shape = season), color = "black", size = 3.5) +
  scale_x_continuous(breaks = phylosimps_breaks, limits = phylosimps_limits, expand = c(0, 0)) +
  scale_y_continuous(breaks = seq(0, 75, by = 15), limits = c(-5, 75)) +
  scale_shape_manual(values = season_shapes) +
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors) +
  labs(
    x = expression({}^2*italic(PD)),
    y = expression(atop("Community Production", "(μgC/L/day)"))
  ) +
  theme(legend.position = "none")

# Summary stats
summary(lm(
  frac_bacprod ~ phylo_simpson,
  data = filter(wide_div_meta_df, fraction == "Particle")
))
# WITHOUT THE TWO HIGH POINTS
# NOTE(review): the exclusion is on `simpson < 70`, not on phylo_simpson -- confirm this is intended.
summary(lm(
  frac_bacprod ~ phylo_simpson,
  data = filter(wide_div_meta_df, fraction == "Particle" & simpson < 70)
))

################ Phylogenetic Inverse Simpson vs Per-Capita (Per-Cell) Production
## 1. Build the plotmath label from the model's adjusted R2 and its FDR-adjusted p-value.
##    The label is only referenced by the commented-out annotate() call below.
lm_lab_percell_phylosimps_PA <- paste("atop(R^2 ==", round(summary(lm_percell_prod_vs_phylosimps_PA)$adj.r.squared, digits = 2), ",",
             "p ==", round(unname(figS8_adjusted_pvals[6]), digits = 2), ")")

## 2. Plot Phylogenetic Inverse Simpson vs Per-Capita (Per-Cell) Production
percell_prod_vs_phylosimps_plot <- 
  wide_div_meta_df %>%
  ggplot(aes(x=phylo_simpson, y=log10(fracprod_per_cell_noinf))) + 
  geom_point(size = 3.5, color = "black", aes(shape = season, fill = fraction)) +
  # `guide` must be a guide name or guide object: `TRUE` is not a valid value
  # (the default guide is used instead, and guides() below sets the colour
  # guide explicitly), and the deprecated `FALSE` is spelled "none".
  scale_color_manual(values = fraction_colors) +
  scale_fill_manual(values = fraction_colors, guide = "none") +
  scale_shape_manual(values = season_shapes) +
  labs(x = expression({}^2*italic(PD)),
       y = expression(atop(log[10]*"(Per-Capita Production)", "(μgC/cell/day)"))) +
  scale_x_continuous(limits = phylosimps_limits, breaks = phylosimps_breaks, expand = c(0,0)) + 
  scale_y_continuous(limits = c(-8.5,-5), breaks = seq(from = -8, to =-5, by = 1)) + 
  #geom_smooth(data=filter(wide_div_meta_df, fraction == "Particle"), method='lm', color = "#FF6600", fill = NA, linetype = "dotted") + 
  # Add the R2 and p-value to the plot 
  #annotate("text", x=6, y=-7.5, label=lm_lab_percell_phylosimps_PA, parse = TRUE, color = "#FF6600", size = 4) +
  guides(color = guide_legend(override.aes = list(shape = 15))) +
  theme(legend.title = element_blank(), legend.position ="bottom", 
        legend.text = element_text(size = 14))

# Summary stats 
summary(lm(log10(fracprod_per_cell_noinf) ~ phylo_simpson, data = filter(wide_div_meta_df, fraction == "Particle")))
# WITHOUT THE TWO HIGH POINTS 
# NOTE(review): the exclusion is on `simpson < 70`, not on phylo_simpson -- confirm this is intended.
summary(lm(log10(fracprod_per_cell_noinf) ~ phylo_simpson, data = filter(wide_div_meta_df, fraction == "Particle" & simpson < 70)))
```

```{r Figure-S8, fig.width=11, fig.height=6.5}
# 3 x 2 grid: per-liter production panels on the top row (A-C), per-cell
# production panels on the bottom row (D-F); columns are 0PD, 1PD, 2PD.
phydiv_BEF <- plot_grid(
  prod_vs_phylorich_plot + theme(plot.margin = unit(c(0.2, 0.2, 0.2, 0.8), "cm")),
  prod_vs_phyloshan_plot,
  prod_vs_phylosimps_plot,
  # Legends are suppressed on the bottom row; the shared legend is added below
  percell_prod_vs_phylorich_plot + theme(legend.position = "none"),
  percell_prod_vs_phyloshan_plot + theme(legend.position = "none"),
  percell_prod_vs_phylosimps_plot + theme(legend.position = "none"),
  labels = c("A", "B", "C", "D", "E", "F"),
  align = "hv",
  nrow = 2, ncol = 3
)

# Panels above the season legend
plot_grid(
  phydiv_BEF, season_legend,
  rel_heights = c(1, 0.05),
  nrow = 2, ncol = 1
)
```


## Figure S10
```{r Figure-S10, fig.width=6, fig.height=3}
# What about weighted MPD and phylogenetic shannon?
summary(lm(weighted_sesmpd ~ phylo_shannon, data = wide_div_meta_df))

# Panel A: weighted_sesmpd against phylo_shannon
phy_MPD_p1 <- wide_div_meta_df %>%
  ggplot(aes(x = phylo_shannon, y = weighted_sesmpd, fill = fraction)) +
  geom_point(aes(shape = season), size = 3) +
  scale_shape_manual(values = season_shapes) +
  scale_fill_manual(values = fraction_colors) +
  labs(x = expression({}^1*italic(PD)), y = "Weighted MPD") +
  theme(legend.position = "none")

# What about weighted MPD and phylogenetic simpson?
summary(lm(weighted_sesmpd ~ phylo_simpson, data = wide_div_meta_df))

# Panel B: weighted_sesmpd against phylo_simpson
phy_MPD_p2 <- wide_div_meta_df %>%
  ggplot(aes(x = phylo_simpson, y = weighted_sesmpd, fill = fraction)) +
  geom_point(aes(shape = season), size = 3) +
  scale_shape_manual(values = season_shapes) +
  scale_fill_manual(values = fraction_colors) +
  labs(x = expression({}^2*italic(PD)), y = "Weighted MPD") +
  theme(legend.position = "none")

# Both panels side by side
# NOTE(review): the earlier comment said "Figure S9"; the chunk is labelled Figure-S10.
plot_grid(phy_MPD_p1, phy_MPD_p2, labels = c("A", "B"), nrow = 1, ncol = 2)


```

## Figure S11
**RDA with Euclidean Distances**
```{r Figure-S11, fig.height=4, fig.width=5}
# Importance table of the ordination summary, computed once and reused below.
# As used here: row 2 is plotted as the variation explained per axis and
# row 3 as the accumulated variation.
pca_environ_importance <- summary(pca_environ)$cont$importance

# Make the Supplemental Figure: biplot with axis titles showing the percent
# of variation on PC1 and PC2
par(mar = c(4.5, 4.5, 1, 1), mfrow = c(1,1))
biplot(pca_environ,
     xlab = paste("PC1", paste(round(pca_environ_importance[2,1]*100, digits = 2), "%", sep = ""), sep = ": "),
     ylab = paste("PC2", paste(round(pca_environ_importance[2,2]*100, digits = 2), "%", sep = ""), sep = ": "))

# Plot the amount of variation explained by each of the axes.
par(mar = c(5,5,2,5))
plot(pca_environ_importance[2,]*100, 
     xlab = "PCA Axis", 
     ylab = "Variation Explained Per Axis",
     ylim = c(0, 105),
     col = "cornflowerblue",
     cex =2,
     pch = 16)
# Overlay the accumulated variation on the same panel (same y-limits, no new
# axes), then draw its scale on the right-hand side.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
par(new = TRUE)
plot(pca_environ_importance[3,]*100,
       cex = 2,
       pch = 17,
       ylim = c(0, 105),
       col = "firebrick3",
       axes=FALSE, 
       xlab=NA, 
       ylab=NA)
axis(side = 4)
mtext(side = 4, line = 3, "Total Accumulated Variation")
legend("right",
       legend=c("Per Axis", "Total"),
       pch=c(16, 17), col=c("cornflowerblue", "firebrick3"))

# Amount of variation explained by the first two axes
pca_environ_importance[3, 2]*100
```

The first two axes of the PCA explain ~70% of the variation in the environmental data!

## Table S1
```{r Table-S1}
# Keep the community-production regressions with FDR-adjusted p < 0.05,
# drop the response label and raw p-value, and sort by decreasing logLik.
tableS1 <- individual_lm_results %>%
  dplyr::filter(dependent_var == "Community Production" & FDR.p.value < 0.05) %>%
  dplyr::select(-c(dependent_var, p.value)) %>%
  arrange(-logLik)
#write.csv(tableS1, file = "tables/Table-S1.csv", quote = FALSE, row.names = FALSE) 

#### Community-Wide Production
datatable(
  tableS1,
  rownames = FALSE,
  options = list(pageLength = 50),
  caption = "Table S1: Ordinary least squares regression statistics for community-wide heterotrophic production with a FDR-corrected p-value of less than 0.05."
)
```

## Table S2
```{r Table-S2}
# Keep the per-capita-production regressions with FDR-adjusted p < 0.05,
# drop the response label and raw p-value, and sort by decreasing logLik.
tableS2 <- individual_lm_results %>%
  dplyr::filter(dependent_var == "Per-Capita Production" & FDR.p.value < 0.05) %>%
  dplyr::select(-c(dependent_var, p.value)) %>%
  arrange(-logLik)
#write.csv(tableS2, file = "tables/Table-S2.csv", quote = FALSE, row.names = FALSE) 

### Per-Capita Production
datatable(
  tableS2,
  rownames = FALSE,
  options = list(pageLength = 50),
  caption = "Table S2: Ordinary least squares regression statistics for per-capita heterotrophic production with a FDR-corrected p-value of less than 0.05."
)
```

## All Regression Results

```{r all-regressions}
### Every regression result (unfiltered), 20 rows per page
datatable(
  individual_lm_results,
  rownames = FALSE,
  options = list(pageLength = 20)
)
```


# Bonus Plots 

```{r timeseries, fig.width=6, fig.height=7}
## PLOT THE MEANS
# Mean of each diversity metric per fraction x season, with the fraction
# means joined by a line inside every Diversity x season facet.
wide_div_meta_df %>%
  dplyr::select(c(norep_filter_name:simpson, lakesite, fraction, season)) %>%
  gather(key = "Diversity", value = div_val, richness:simpson) %>%
  mutate(norep_name = paste0(substr(norep_filter_name, 1, 4), substr(norep_filter_name, 6, 8))) %>%
  dplyr::group_by(fraction, Diversity, season) %>%
  summarize(mean_div = mean(div_val)) %>%
  # Grouping key for the connecting lines
  mutate(connector = paste0(Diversity, season)) %>%
  ggplot(aes(x = fraction, y = mean_div)) +
  # connect paired PA and FL points
  geom_line(aes(group = connector), alpha = 0.8, size = 1.5) +
  geom_point(size = 3, fill = "grey") +
  #geom_boxplot(alpha = 0.5, outlier.shape = NA) + 
  facet_grid(Diversity~season, scales = "free") +
  # scale_fill_manual(values = fraction_colors) + 
  scale_color_manual(values = c("#5B1A18",  "#FD6467", "#F1BB7B")) + #"#D67236",
  labs(y = "Mean Hill Diversity Metric") +
  theme(axis.title.x = element_blank())

# Raw diversity values by fraction; a line joins the rows sharing the same
# norep_name (the filter name with character 5 removed), coloured by season.
lines_frac_p <- wide_div_meta_df %>%
  dplyr::select(c(norep_filter_name:simpson, lakesite, fraction, season)) %>%
  gather(key = "Diversity", value = div_val, richness:simpson) %>%
  mutate(norep_name = paste0(substr(norep_filter_name, 1, 4), substr(norep_filter_name, 6, 8))) %>%
  ggplot(aes(x = fraction, y = div_val)) +
  # connect paired PA and FL points
  geom_line(aes(color = season, group = norep_name), alpha = 0.8, size = 1.5) +
  geom_point(aes(shape = lakesite), size = 3, fill = "grey") +
  #geom_boxplot(alpha = 0.5, outlier.shape = NA) + 
  facet_grid(Diversity~season, scales = "free") +
  # scale_fill_manual(values = fraction_colors) + 
  scale_color_manual(values = c("#5B1A18",  "#FD6467", "#F1BB7B")) + #"#D67236",
  scale_shape_manual(values = lakesite_shapes) +
  labs(y = "Hill Diversity Metric") +
  theme(
    axis.title.x = element_blank(),
    legend.position = "bottom",
    legend.box = "vertical",
    legend.title = element_blank(),
    legend.spacing.y = unit(0.1, "cm")
  )

# center the legend over the plot
lines_frac_legend <- get_legend(
  lines_frac_p +
    theme(
      legend.direction = "horizontal",
      legend.justification = "center",
      legend.box.just = "bottom"
    )
)
# Legend-free plot stacked above the extracted legend
plot_grid(
  lines_frac_p + theme(legend.position = "none"),
  lines_frac_legend,
  rel_heights = c(1, 0.08),
  nrow = 2, ncol = 1
)

# by season
# Raw diversity values across seasons; a line joins the rows sharing the
# first five characters of norep_filter_name, within Diversity x fraction facets.
wide_div_meta_df %>%
  dplyr::select(c(norep_filter_name:simpson, lakesite, fraction, season)) %>%
  gather(key = "Diversity", value = div_val, richness:simpson) %>%
  mutate(norep_noseason_name = substr(norep_filter_name, 1, 5)) %>%
  ggplot(aes(x = season, y = div_val)) +
  # connect the points that share norep_noseason_name
  geom_line(aes(color = fraction, group = norep_noseason_name), alpha = 0.8, size = 1.5) +
  geom_point(aes(fill = fraction, shape = lakesite), size = 3) +
  #geom_boxplot(alpha = 0.5, outlier.shape = NA) + 
  facet_grid(Diversity~fraction, scales = "free") +
  scale_shape_manual(values = lakesite_shapes) +
  scale_color_manual(values = fraction_colors) + #"#D67236",
  scale_fill_manual(values = fraction_colors) +
  labs(y = "Hill Diversity Metric") +
  theme(legend.position = "none", axis.title.x = element_blank())
```

# Session Information
```{r session-info}
# Print the session details, including the version of every R package in use
devtools::session_info()
```