Main.rmd

---
output: html_document
editor_options: 
  chunk_output_type: console
---
# Demonstration code
In this code we walk through the analysis for [preprint], building a method for fitting a hierarchical Bayesian longitudinal model to repeat measurement data.

In this analysis we will build simulated datasets of tree size with various growth functions, then fit models to that data. We will also demonstrate the method applied to a real-world dataset. In the course of the investigation we consider the behaviour of different sample sizes, growth functions, and numerical integration methods.

The data files used in the paper are available as a release associated with the repo. We have included the data we use from BCI in the repo itself, which is sampled from a bigger dataset. Details for where to access the raw data is available in the relevant section.

Build file structures for output.
```{r}
if (!file.exists("output")){
  dir.create("output")
  dir.create("output/data")
  dir.create("output/figures")
  dir.create("output/figures/diagnostic")
  dir.create("output/figures/sampled")
  dir.create("output/figures/RungeKuttaDemo")
}
```

Load libraries:
```{r}
library(rstan)
library(rstantools)
library(tidyverse)
library(vioplot)
library(ggbeeswarm)
library(parallel)
library(latex2exp)
library(grid)
library(svglite)
library(mnormt)
```

Now we are ready to start doing things.

First we set the file locations for output and initialise data structures.
```{r}
rstan_data_locations <- list(
  single_species_data = "output/data/SingleSpecies_RStan",
  single_individual_data = "output/data/SingleInd_RStan"
)

data_build_models <- list( #Swap out as required
  exp_model = list(
    model_spec = "R/ExponentialSpec.r",
    model_name = "Exponential",
    model_file = "Exp"
  )
)

data_build_storage_list <- list(
  canham_model = list(
    model_spec = "R/CanhamSpec.r",
    model_name = "Canham",
    model_file = "Canham"
  ),
  
  const_model = list(
    model_spec = "R/ConstSpec.r",
    model_name = "Constant",
    model_file = "Const"
  ),
  
  exp_model = list(
    model_spec = "R/ExponentialSpec.r",
    model_name = "Exponential",
    model_file = "Exp"
  )
)
  
sim_pars <- list(N_ind = 20, #Parameters controlling simulation
                 S0_mean = 2,
                 S0_sd = 1,
                 step_size = 0.1,
                 N_obs = 5, #Observations per individual
                 census_interval = 5, 
                 model = "exp")

sim_pars$time <- seq(from=0, to=(sim_pars$N_obs*sim_pars$census_interval -1), 
                     by=sim_pars$census_interval)

#Control list to be passed to data construction
data_construction_controls <- list(
  data_build_models = data_build_models,
  
  sim_pars = sim_pars,
  
  build_data_single_individual = TRUE,
  
  error_type = "Norm", #Norm for N(0, 0.5) error, Ruger for more complex model
  
  #Specify where data is to be saved and loaded from.
  rstan_data_locations = rstan_data_locations
)

#Change to TRUE to generate data
build_datasets <- TRUE
```

We are ready to build the first multi-individual data sets. These are produced with error added, in a later section we will look at observations without error to investigate integration methods. Datasets built with the Canham function will include a step size parameter which is used to control sub-steps of the Runge-Kutta 4th order integration.
```{r}
if(build_datasets){
  source("R/BuildData.r", local=TRUE)
  
  build_data(data_construction_controls)
}
```

Now we are going to build data sets for the different sample sizes using the constant growth model. Building the largest dataset (10,000 individuals) can take a few minutes.
```{r}
data_build_models_sample_size <- list(
  const_model = list(
    model_spec = "R/ConstSpec.r",
    model_name = "Constant",
    model_file = "Const"
  )
)
  
sim_pars <- list(N_ind = 10, #Parameters controlling simulation
                 S0_mean = 1.5,
                 S0_sd = 1,
                 step_size = 0.1,
                 N_obs = 5, #Observations per individual
                 census_interval = 5,
                 model = "const")
sim_pars$time <- seq(from=0, to=(sim_pars$N_obs*sim_pars$census_interval -1), 
                     by=sim_pars$census_interval)

data_construction_controls <- list(
  data_build_models = data_build_models_sample_size,
  
  sim_pars = sim_pars,
  
  build_data_single_individual = FALSE,
  
  #Specify where data is to be saved and loaded from.
  rstan_data_locations = rstan_data_locations,
  
  error_type="Norm"
)

build_datasets <- TRUE

for(i in c(10, 100, 1000, 10000)){
  data_construction_controls$sim_pars$N_ind <- i
  data_construction_controls$rstan_data_locations$single_species_data <- 
    paste("output/data/", i, "_" , "SingleSpecies_RStan", sep="")
  
  if(build_datasets){ #10 indviduals first
    source("R/BuildData.r", local=TRUE)
    
    build_data(data_construction_controls)
  }
}
```

# Single time series demonstration

Here we will demonstrate the effect of each integration method (Euler, midpoint, Runge-Kutta 4th order) on the size and growth parmeter estimates from a single individual's growth history, projected forward with an exponential growth function.

Load in functions.
```{r}
source("R/RungeKuttaFunctions.r")
```

Set parameters and initial condition
```{r}
beta <- 1.0
S_0 <- 1.0
step_size <- 1.0
n_step <- 5
time <- c(0, 1, 2, 3, 4)

#Vectors of data
est_names <- c("Euler", "Midpoint", "RK4")
colours <- c("blue", "red", "green4", "black") #Default for non-error models
```

Numerical integration without MCMC or error, which requires knowing the value of beta.
```{r}
growth <- function(size, pars){
  return(pars[1]*size)
}
#Euler method
euler_int <- euler_est(S_0, growth, pars=c(beta), step_size, nstep)

#Midpoint method
midpoint_int <- midpoint_est(S_0, growth, pars=c(beta), step_size, nstep)

#Runge-Kutta 4th order
runge_kutta_int <- rk4_est(S_0, growth, pars=c(beta), step_size, nstep)
```
Plot the numerical integration projections.

```{r}
lines_list <- list(euler_int, midpoint_int, runge_kutta_int)
points_list <- lines_list
names <- c(est_names, "Function")
lty_list <- c(1,1,1,1)
pch_list <- c(3,4,5,NA)
title <- "Numerical Integration"

#Produce plot
plot_estimates(lines_list, points_list, names, colours, 
               pch_list, lty_list, title, lwd=2)

#Save output to file
file_name <- "output/figures/RungeKuttaDemo/Single_NumericalIntegration.svg"
save_plot_of_estimates(lines_list, points_list, names, colours, pch_list, 
                       lty_list, title, file_name, lwd=2)
```

In this instance we will use the analytic solution to construct the data rather than using a forward projection with a numerical method as we did above.

Build base observation data
```{r}
N_obs <- n_step
census <- c(1, 2, 3, 4, 5)
S_obs <- S_0  * exp(beta*time)
census_interval <- rep(step_size, times=5) #Step size

rstan_data <- build_rstan_data(N_obs, S_obs, census, census_interval)
```

Now we construct a data set with measurement error.
```{r}
S_obs_err <- c()
for(i in 1:length(S_obs)){
  S_obs_err[i] <- add_error(S_obs[i], "Norm") #Implementing error model
}

#Alternate implementation
S_obs_err <- S_obs + rnorm(length(S_obs), 0, 0.5)

rstan_data_error <- build_rstan_data(N_obs, S_obs_err, census, census_interval)
```

From here we run the Bayesian models 
```{r}
model_outputs_single <- list()
```

MCMC estimation from analytic solution to demonstrate the different integration methods. As this will run chains in sequence it may take a few minutes. Output plots can be seen in output/figures/RungeKuttaDemo/.
```{r}
model_outputs_single$MCMC <- build_output(rstan_file = "stan/Exp_SingleIndividual.stan", 
                                   rstan_data = rstan_data,
                                   est_names, 
                                   est_method = "samp",
                                   int_methods = c(1,2,3),
                                   title = "MCMC Estimation",
                                   file_name = "output/figures/RungeKuttaDemo/Single_MCMCIntegration.svg",
                                   names = c(est_names, "Function"),
                                   colours,
                                   lty_list <- c(1,1,1,1),
                                   pch_list <- c(3,4,5,NA),
                                   estplot = TRUE)
```

MCMC estimation with measurement error.
```{r}
model_outputs_single$MCMC_Error <- build_output(rstan_file = "stan/Exp_SingleIndividual.stan", 
                                         rstan_data = rstan_data_error,
                                         est_names, 
                                         est_method = "samp",
                                         int_methods = c(1,2,3),
                                         title = "MCMC Estimation With Error",
                                         file_name = "output/figures/RungeKuttaDemo/Single_MCMCIntegration_Error.svg",
                                         names = c(est_names, "Function"),
                                         colours,
                                         lty_list <- c(1,1,1,1),
                                         pch_list <- c(3,4,5,NA),
                                         estplot = TRUE)
```

To investigate the numerical integration behaviour fully we repeat the error simulation and model fitting. 
Warning: The following block will take a long time to run! You can reduce the number of iterations required using the n_iter argument.
```{r}
n_iter = 100
size_2_est <- tibble()
beta_est <- tibble()
model <- stan_model(file="stan/Exp_SingleIndividual.stan")

for(i in 1:n_iter){
  #Add error to analytic solution
  S_obs_err <- abs(S_obs + rnorm(length(S_obs), 0, 0.5))
  rstan_data_error <- build_rstan_data(N_obs, S_obs_err, census, census_interval)
  
  #Iterate through the integration methods getting estimates from each one.
  for(j in 1:3){
    fit <- sampling(model, data=rstan_data_error[[j]], 
                    iter=2000,
                    chains=4,
                    cores=4)
    
    est_data <- rstan::extract(fit, permuted=TRUE)
    size_temp <- tibble(int_method = j, iter = i, size_obs = S_obs[2],
                        size_est = apply(est_data$S_hat, 2, mean)[2])
    beta_temp <- tibble(int_method = j, iter = i, 
                        beta = mean(est_data[["ind_beta"]]))
    
    size_2_est <- rbind(size_2_est, size_temp)
    beta_est <- rbind(beta_est, beta_temp)
  }
}

boxplot(beta_est$beta ~ beta_est$int_method, xlab = "Integration Method", ylab="Beta",
        xaxt = "n", col=c("blue3", "red", "green4"), main="Estimates of Beta")
abline(h=1, col="grey18")
axis(1, at = c(1,2,3), labels = c("Euler", "Midpoint", "RK4"))

boxplot(size_2_est$size_est ~ size_2_est$int_method, xlab = "Integration Method", ylab="S(2)",
        xaxt = "n", col=c("blue3", "red", "green4"), main="Estimates of S(2)")
abline(h=S_obs[2], col="grey18")
axis(1, at = c(1,2,3), labels = c("Euler", "Midpoint", "RK4"))


```

# BCI data demonstration
In this section we will fit a Canham growth model to a sample of individuals from Garcinia recondita in the Barro Colorado Island forest plot data:
Condit, R., S. Lao, R. Pérez, S. Aguilar, R.B. Foster, S.P. Hubbell. (2019). Complete data from the Barro Colorado 50-ha plot: 423617 trees, 35 years, v3, DataONE Dash, Dataset, https://doi.org/10.15146/5xcp-0d46.

A simple random sample without replacement was used to select 400 individuals with 6 observations. The RStan data file has been constructed and is stored as gar2in_rstan_data.rds in the data folder.

```{r}
run_model_BCI_example <- TRUE

if(run_model_BCI_example){
  source("R/CanhamSpec.r", local=TRUE)
  source("R/RunModels.r", local=TRUE)

  dataset <- readRDS("data/gar2in_rstan_data.rds")
  
  fit <- build_rstan_model(stan_controls = single_species_controls$stan_controls, 
                                model_name = single_species_controls$model_name,
                                stan_file_name = single_species_controls$stan_file_path,
                                est_method = "samp",
                                dataset) #Index 3 used to fit into the existing functions
  
  filename <- paste(single_species_controls$fit_save_path, "GRecondita", sep="_")
  filename <- paste(filename, ".rds", sep="")
  saveRDS(fit, file=filename) #Output to file
  
  #Wrangle file names for diagnostic plots
  diag_list <- single_species_controls$diagnostic_list
  for(j in 1:length(diag_list)){
    diag_list[[j]]$name <- paste(diag_list[[j]]$name, "GRecondita", sep="_")
  }
  #Plot diagnostics
  plot_all_diagnostics(fit, int_method = 3, 
                       diag_list,
                       inc_warmup = FALSE,
                       RHat_name = "GRecon")
  
  #Further diagnostics
  #Initialise data frame for Rhat diagnostics.
  Rhat_data <- data.frame(row.names=names(fit@sim[["samples"]][[1]]))
  #Extract values to data frame
  for(i in 1:length(fit@sim[["samples"]])){
    for(j in 1:length(fit@sim[["samples"]][[1]])){
      Rhat_data[j,i] <- Rhat(fit@sim[["samples"]][[i]][[j]])
    }
  }
  #Get a histogram of all the RHats
  histvec <- c()
  for(i in 1:ncol(Rhat_data)){ histvec <- c(histvec, Rhat_data[,i]) }
  hist(histvec, col="lightblue", main="Histogram of Rhats for all chains", xlab="Rhat")
  length(which(histvec > 1.05))
  
  saveRDS(fit, file="output/data/Single_Species_Fit_Canham_GRecondita.rds")
  rm(fit)
}
```

Once the model has been fit and the diagnostics do not indicate problems, we can extract the fitted values and analyse them.
```{r}
extract_analyse <- TRUE

if(extract_analyse){
  #Load in files
  fit <- readRDS("output/data/Single_Species_Fit_Canham_GRecondita.rds")
  dataset <- readRDS("data/gar2in_rstan_data.rds")
  dataset$int_method <- 3
  
  source("R/EstimateWrangling.r", local=TRUE)
  source("R/Analyse.r", local=TRUE)
  source("R/CanhamSpec.r", local=TRUE)
  
  #Sample extraction and analysis
  #Building data frames
  measurement_data <- tibble(treeid_factor = dataset$treeid_factor,
                             S_obs = dataset$S_obs,
                             census = dataset$census,
                             census_interval = dataset$census_interval)
  measurement_data <- measurement_data %>% 
    group_by(treeid_factor) %>%
    mutate(treeid = dataset$tree_id_vec[treeid_factor],
           time=cumsum(census_interval)) %>%
    ungroup()
  
  individual_data <- tibble(treeid = dataset$tree_id_vec,
                            treeid_factor = 1:400)
  
  #Extract samples
  est_data <- rstan::extract(fit, permuted=TRUE)
  
  #Measurement-level data
  measurement_data$S_hat <- apply(est_data$S_hat, 2, mean)
  measurement_data$G_hat <- apply(est_data$G_hat, 2, mean)
  
  #Individual-level data
  for(i in ind_pars){
    individual_data[[ind_pars_names[which(ind_pars == i)]]] <- apply(est_data[[i]], 2, mean)
  }
  
  #Hist/boxplot combos
  hist_names <- c("Max growth - g_max", "Size at max growth - S_max", "Spread - k")
  for(i in 1:length(growth_par_names)){
    layout(mat = matrix(c(1,2),2,1, byrow=TRUE),  height = c(2,8))
    
    par(mar=c(0, 3.1, 1.1, 2.1))
    boxplot(individual_data[[growth_par_names[i]]] , horizontal=TRUE ,
            xaxt="n" ,
            col="palegreen3" , frame=F)
    par(mar=c(4, 3.1, 1.1, 2.1))
    hist(individual_data[[growth_par_names[i]]] , breaks=10 ,
         col="palegreen" , main="" ,
         xlab=hist_names[i])
  }
  
  #Get final size for each individual
  temp <- measurement_data %>%
    group_by(treeid) %>%
    mutate(S_final = max(S_hat)) %>%
    distinct(treeid, S_final) %>%
    ungroup() %>%
    select(S_final)

  individual_data$S_final <- temp$S_final
  
  #Compare individual-level parameters
  pairs(~individual_data$canham_max_growth_hat + 
          individual_data$canham_diameter_at_max_growth_hat +
          individual_data$canham_K_hat,
        labels = c("g_max", "S_max", "K"))
  
  #Large-scale plot of single pair
  pairplot <- ggplot(data=individual_data, aes(x,y))+
    geom_point(aes(x=canham_max_growth_hat, y=canham_K_hat), 
               size=2, color="green4", alpha=0.6) +
    labs(x = "g_max", y = "k") +
    theme_classic()
  
  #Species-level data
  species_par_est <- c()
  for(i in 1:length(sp_pars)){ #Extract estimates
    species_par_est[i] <- mean(est_data[[sp_pars[i]]])
  }
  species_data <- tibble(species_par_est, sp_pars_names, Lower=c(), Upper=c())
  #Build CIs
  hyperpar_CI <- build_par_CI(est_data, sp_pars, sp_pars_names)
  for(i in 1:length(hyperpar_CI)){
    species_data$Lower[i] <- hyperpar_CI[[i]][[1]]
    species_data$Upper[i] <- hyperpar_CI[[i]][[2]]
  }
  
  #Save data to file
  save_data <- list(individual_data = individual_data,
                    measurement_data = measurement_data,
                    species_data = species_data)
  save_filename <- "output/data/GRecondita_Sampled_Canham_CompiledData.rds"
  saveRDS(save_data, file=save_filename)
  
  #Load if needed
  save_data <- readRDS("output/data/GRecondita_Sampled_Canham_CompiledData.rds")
  individual_data <- save_data$individual_data
  measurement_data <- save_data$measurement_data
  species_data <- save_data$species_data
  
  #Plots of individual sizes over time, estimated and observed
  n_ind <- 400 #Number of plots to do
  plot_obs_and_est_life_history(measurement_data, n_ind, "Canham")
  
  #Scatter plot of individual R^2 values against total estimated growth
  r_sq_summary <- measurement_data %>%
    group_by(treeid_factor) %>%
    summarise(
      total_est_growth = max(S_hat) - min(S_hat),
      r_sq = cor(S_hat, S_obs)^2
    )
  file_name <- "output/figures/RSq_Growth_Scatter.png"
  r_sq_scatter <- ggplot(data = r_sq_summary, aes(x=total_est_growth, y=r_sq))+
    geom_point(colour = "green4", alpha=0.5) +
    labs(x = "Total est. growth (cm)",
         y = "R-sq: obs. and est. sizes")+
    theme_classic() +
    theme(axis.text=element_text(size=12),
          axis.title=element_text(size=14,face="bold"))
  ggsave(file_name, plot=r_sq_scatter, width=100, height=100, units="mm")
  
  #Number of individuals with R^2 < 0.75
  sum(r_sq_summary$r_sq <0.75)
  sum(r_sq_summary$r_sq <0.25)
  
  #Maximum growth for individuals with R^2 < 0.75
  max(r_sq_summary$total_est_growth[which(r_sq_summary$r_sq <0.75)])
  
  #Total sample R^2
  cor(measurement_data$S_obs, measurement_data$S_hat)^2
  
  #Plots of sizes over time, and growth over size, for all individuals
  plot_est_life_history(measurement_data, model_name="Canham")
  
  #Plot of individual size over time based on percentiles of individual parameters
  ind_pars_est <- c(g_max <- quantile(individual_data$canham_max_growth_hat, 0.5),
                       S_max <- quantile(individual_data$canham_diameter_at_max_growth_hat, 0.5),
                       k <- quantile(individual_data$canham_K_hat, 0.5))
  pop_pars_est <- c(g_max_mean <- exp(species_data$species_par_est[1]),
                     S_max_mean <- exp(species_data$species_par_est[3]),
                     k_mean <- exp(species_data$species_par_est[5]))
  
  S_0 <- rep(3, n_ind) #Start at 3cm in size to avoid getting stuck at 1cm
  step_size <- 0.1 #Precision of integration method
  lifespan <- 300 #How long to project forward tree size.
  
  plot_pair_life_history(pop_pars_est, ind_pars_est, growth_function, 
                    name = "BCIAvg", 
                    S_0, step_size, lifespan)
  
  #Plot growth functions for all individuals.
  n_ind <- 400
  growth_par_ests <- data.frame(g_max = individual_data$canham_max_growth_hat, 
                                s_max = individual_data$canham_diameter_at_max_growth_hat, 
                                k = individual_data$canham_K_hat)
  
  #Plot the growth function histories starting at 3cm
  plot_sample_life_history(growth_par_ests, growth_function, 
                           name = "GRecondita_EstimatedPars", 
                           n_ind, S_0, step_size=step_size, lifespan=lifespan)
  
  #Plot the growth function histories starting at S_0
  plot_sample_life_history(growth_par_ests, growth_function, 
                           name = "GRecondita_From_S_0", 
                           n_ind, 
                           S_0 = individual_data$canham_S_0_hat, 
                           S_final = individual_data$S_final,
                           step_size, lifespan = 150,
                           min_growth_size = min(individual_data$canham_S_0_hat),
                           max_growth_size = 100)
  
  plot_sample_life_history(growth_par_ests, growth_function, 
                           name = "GRecondita_Observed", 
                           n_ind, 
                           S_0 = individual_data$canham_S_0_hat, 
                           S_final = individual_data$S_final,
                           step_size = step_size, 
                           lifespan = 150)
}
```

For a direct comparison to Herault (2011) we fit a second G. recondita model. This one uses pairwise differences as point estimates of growth independent of the repeat measurement structure.
```{r}
#Build dataset from pairwise differences
source("R/RunModels.r")
dataset <- readRDS("data/gar2in_rstan_data.rds")
temp_obs <- tibble(
  S_obs = dataset$S_obs,
  treeid_factor = dataset$treeid_factor,
  census = dataset$census,
  census_interval = dataset$census_interval
  ) %>%
  group_by(treeid_factor) %>%
  mutate(delta_S_obs = lead(S_obs) - S_obs) %>%
  ungroup() %>%
  select(S_obs, delta_S_obs, census_interval) %>%
  drop_na()
      
species_rstan_data <-
  list(
    N_obs = nrow(temp_obs),
    S_obs = temp_obs$S_obs,
    delta_obs = temp_obs$delta_S_obs,
    census_interval = temp_obs$census_interval
  )

#Build model
plot_diagnostics_species_level <- TRUE
model <- "canham"
        
sp_model <- stan_model(file="stan/Canham_SpeciesLevel.stan")
sp_model_start <- Sys.time()
print(paste("Fitting species-level Canham model: ", sp_model_start, sep=""))
species_only_fit <- sampling(sp_model, data=species_rstan_data, 
                             iter=2000,
                             chains=4,
                             cores=4)
sp_model_end <- Sys.time()
print(paste("Species-level Canham model end: ",sp_model_end, sep=""))
#warnings()

if(plot_diagnostics_species_level){
  plot_diagnostic_trace(species_only_fit, 3, c("species_max_growth", 
                                               "species_diameter_at_max_growth", 
                                               "species_K"), 
                        paste("GRecondita", model, "species_model_pars", sep="_"), FALSE)
}

#Save fit
saveRDS(species_only_fit, "output/data/GRecondita_SpeciesLevelFit.rds")
rm(species_only_fit)
```

To compare the species-level model to the results from the hierarchical set-up, we plot the average trajectory of the species-level model over the individual trajectories.
```{r}
source("R/Analyse.r")

#Import individual-level parameters
individual_data <- readRDS("output/data/GRecondita_Sampled_Canham_CompiledData.rds")$individual_data

#Extract species level parameters
species_only_fit <- readRDS("output/data/GRecondita_SpeciesLevelFit.rds")
sp_est_data <- rstan::extract(species_only_fit, permuted=TRUE)
species_growth_pars <- tibble(
  g_max = mean(sp_est_data$species_max_growth),
  S_max = mean(sp_est_data$species_diameter_at_max_growth),
  k = mean(sp_est_data$species_K),
  S_0 = 1,
  S_final = max(individual_data$S_final), #Get maximum size
  fit_model = "Species-level model"
)

ind_growth_pars <- tibble(
  g_max = individual_data$canham_max_growth_hat,
  S_max = individual_data$canham_diameter_at_max_growth_hat,
  k = individual_data$canham_K_hat,
  S_0 = individual_data$canham_S_0_hat,
  S_final = individual_data$S_final, #Get vector of final 
  fit_model = "Individual fits"
)

plot <- ggplot() +
    xlim(1, max(individual_data$S_final)) +
    xlab("Size (cm)") +
    ylab("Growth rate (cm/yr)") +
    theme_classic() +
    theme(axis.text=element_text(size=16),
          axis.title=element_text(size=18,face="bold"))
  
for(i in 2:nrow(ind_growth_pars)){
  args_list <- list(pars=ind_growth_pars[i,1:3])
  plot <- plot +
    geom_function(fun=growth_function, args=args_list, alpha=0.2, 
                  linewidth=1, xlim=c(ind_growth_pars$S_0[i], ind_growth_pars$S_final[i]),
                  color = "green4", linetype = "solid")
}

plot <- plot +
  geom_function(fun=growth_function, 
                args=list(pars=ind_growth_pars[1,1:3]), 
                alpha=0.2, 
                linewidth=1,
                aes(col = ind_growth_pars$fit_model[1],
                    linetype = ind_growth_pars$fit_model[1]),
                xlim=c(ind_growth_pars$S_0[1], ind_growth_pars$S_final[1])) +
  geom_function(fun=growth_function, 
                args=list(pars=species_growth_pars[1,1:3]), 
                alpha=1, 
                linewidth=1.2,
                aes(col = species_growth_pars$fit_model[1],
                    linetype = species_growth_pars$fit_model[1]),
                xlim=c(0.5, max(individual_data$S_final))) +
  scale_color_manual(values = c("green4", "black"),
                     name = "Parameter source") +
  scale_linetype_manual(values = c("solid", "dashed"),
                        name = "Parameter source") +
  theme(legend.position = "inside",
        legend.position.inside = c(0.7,0.8),
        legend.title = element_text(size=14),
        legend.text = element_text(size=12),
        legend.key.width = unit(1.2, 'cm'),
        axis.text=element_text(size=12),
        axis.title=element_text(size=14)) 

plot
```

As part of the method analysis we want to check what the out-of-sample predictive power is like, compared to the ability to predict sizes within the sample. To examine this we look at the ability of models fit to the first 5 observations with regards to their capacity to extrapolate to a 6th size that we can compare to the observed 6th size. 

We are going to build 2 more models: a hierarchical model with Canham growth function fit to 5 observations per individual, and a hierarchical model with constant growth function and individual level average growth parameters fit to 5 observations. The constant growth model is directly analogous to a linear mixed effects model for sizes over time with individual effect term. 

The hierarchical models fit to 5 observations will estimate the 6th size based off of the estimated 5th size and forward projection of the growth curve with the individual growth parameters and the census interval between the 5th and 6th observations. The hierarchical model fit to 6 observations directly estimates the 6th size.
```{r}
fit_5_obs <- TRUE

if(fit_5_obs){
  source("R/RunModels.r", local=TRUE)
  
  #Select model parameters
  stan_controls = list(
    iter = 2000,
    chains = 4,
    cores = 4,
    control = list(max_treedepth = 20)
  )
  
  #Fit Canham hierarchical model to 5 observations
  #This fit takes a while, expect 50+ hours.
  source("R/CanhamSpec.r")
  dataset <- readRDS("data/gar2in_5_rstan_data.rds")
  fit <- build_rstan_model(stan_controls,
                           model_name = "Canham",
                           stan_file_name = "stan/Canham_SingleSpecies.stan",
                           est_method = "samp",
                           rstan_data = dataset)
  saveRDS(fit, file="output/data/Single_Species_Fit_Canham_GRecondita_5obs.rds")
  #Wrangle file names for diagnostic plots
  diag_list <- single_species_controls$diagnostic_list
  for(j in 1:length(diag_list)){
    diag_list[[j]]$name <- paste(diag_list[[j]]$name, "GRecondita_5obs", sep="_")
  }
  #Plot diagnostics
  plot_all_diagnostics(fit, int_method = 3, 
                       diag_list,
                       inc_warmup = FALSE,
                       RHat_name = "GRecon_Canham_5obs")
  rm(fit)
  
  #Fit constant growth model to 5 observations
  dataset <- readRDS("data/gar2in_5_rstan_data.rds")
  source("R/ConstSpec.r")
  fit <- build_rstan_model(stan_controls,
                           model_name = "Constant",
                           stan_file_name = "stan/ConstGrowth_SingleSpecies.stan",
                           est_method = "samp",
                           rstan_data = dataset)
  saveRDS(fit, file="output/data/Single_Species_Fit_Const_GRecondita_5obs.rds")
  #Wrangle file names for diagnostic plots
  diag_list <- single_species_controls$diagnostic_list
  for(j in 1:length(diag_list)){
    diag_list[[j]]$name <- paste(diag_list[[j]]$name, "GRecondita_5obs", sep="_")
  }
  #Plot diagnostics
  plot_all_diagnostics(fit, int_method = 3, 
                       diag_list,
                       inc_warmup = FALSE,
                       RHat_name = "GRecon_Const_5obs")
  rm(fit)
}
```

The next step involves extracting estimated final sizes from different models to compare RMSE. We will look at the hierarchical Canham model fit to 6 observations as a baseline in-sample RMSE, and compare that to each of the three models fit to the first 5 observations. For the hierarchical models (Canham and constant growth) we use the estimated 5th size as the starting point for extrapolation to the 6th. For the species-average model we use the observed 5th size instead.
```{r}
extract_predictive_RMSE <- TRUE

if(extract_predictive_RMSE){
  #Extract size estimates from models
  dataset_all_obs <- readRDS("data/gar2in_rstan_data.rds")
  tibble_6th_obs <- tibble(
   S_obs = dataset_all_obs$S_obs,
   treeid_factor = dataset_all_obs$treeid_factor,
   census = dataset_all_obs$census,
   census_interval = dataset_all_obs$census_interval
  ) %>%
    filter(census == 6)
  
  fit_save_paths <- c(
    "output/data/Single_Species_Fit_Canham_GRecondita_5obs.rds",
    "output/data/Single_Species_Fit_Const_GRecondita_5obs.rds",
    "output/data/Single_Species_Fit_Canham_GRecondita.rds"
  )
  
  fit_name <- c(
    "hierarchical_canham_5_obs",
    "hierarchical_const_5_obs",
    "hierarchical_canham_6_obs"
  )
  
  fit_title <- c(
    "Hierarchical Canham fit to 5 obs.",
    "Hierarchical constant fit to 5 obs.",
    "Hierarchical Canham fit to 6 obs."
  )
  
  model_spec_name_vec <- c(
    "R/CanhamSpec.r",
    "R/ConstSpec.r",
    "R/CanhamSpec.r"
  )

  dataset <- readRDS("data/gar2in_5_rstan_data.rds")
  
  source("R/EstimateWrangling.r")
  pars_list <- list()
  starting_size <- tibble()
  
  for(i in 1:length(fit_name)){
    fit <- readRDS(fit_save_paths[i])
    print(paste0("Wrangling fit: ", fit_name[i]))
    
    temp_return_data <- extract_est_tibble(fit, 
                               model_name = fit_name[i], 
                               model_spec_name = model_spec_name_vec[i], 
                               dataset, 
                               S_6_obs = tibble_6th_obs$S_obs, 
                               census_interval = tibble_6th_obs$census_interval)
    
    starting_size <- rbind(starting_size, temp_return_data$starting_size)
    pars_list <- c(pars_list, temp_return_data$par_list)
  }
  
  starting_size$S_final_hat <- 0
  source("R/Analyse.r")
  #Estimate size for 6th census based on RK4 projection forward from 5th size estimate
  for(i in 1:nrow(starting_size)){
    if((i-1) %% 50 == 0){
      print(paste0("Row: ", i))
    }
    source(starting_size$model_spec_name[i])
    #Estimation from individual model
    pars <- pars_list[[i]]$pars
    
    temp_size_est <- rk4_est(S_0 = starting_size$S_5[i], 
            growth = growth_function, 
            pars = pars, 
            step_size = starting_size$census_interval[i]/5, 
            N_step = 5) 
    
    starting_size$S_final_hat[i] <- temp_size_est[5]
  }
  
  #Calculate error
  starting_size$S_final_error <- starting_size$S_final_hat - starting_size$S_6_obs
  par(mfrow = c(3,1))
  #Histogram of errors
  for(i in 1:length(fit_name)){
    temp_size <- starting_size %>%
      filter(model_name == fit_name[i])
    
    rmse <- sqrt(sum(temp_size$S_final_error^2)/nrow(temp_size))
    
    hist(temp_size$S_final_error, 
         main = paste0(fit_title[i], " RMSE: ", signif(rmse, digits = 3)),
         xlab = "Error est - obs (cm)",
         xlim = c(min(starting_size$S_final_error), max(starting_size$S_final_error)),
         col = "lightblue")
    
    print(paste0("RMSE for ", fit_name[i], " is: ", rmse))
  }
  
  par(mfrow = c(1,1))
}
```

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                     Multi-individual demonstration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In this section we will expand our simulation analysis to multiple individuals so we are able to compare the performance of integration methods for species-level parameters, and across individuals.

First we specify which models to run. 
As written, this code can use MCMC to fit each of constant, exponential, and Canham models to simulated data from those respective growth functions at the multi-species level using the Euler integration method. We will work with the Canham model for the paper.

To change integration method, modify the int_method parameter. 
To modify the number of cores, chains, and samples go to the single_species_controls object in the relevant spec file.
To exclude particular models remove them from the model_list.

Model fits will be saved to output/data/ as .rds files. In the next section the samples will be extracted and transformed into something usable.
```{r}
rstan_data_locations <- list(
  single_species_data = "output/data/SingleSpecies_RStan",
  single_individual_data = "output/data/SingleInd_RStan"
)

model_list <- list(
  exp_model = list( #The exponential model uses the census interval as the integration step size for numerical methods
    model_spec = "R/ExponentialSpec.r",
    model_name = "Exponential",
    model_file = "Exp"
  ))#,
  
storage_list <- list(
  canham_model = list(#This model requires a step size argument in the data passed to it.
    model_spec = "R/CanhamSpec.r",
    model_name = "Canham",
    model_file = "Canham",
    step_size = 1
  ),
  
  const_model = list( #All numerical methods are equivalent for constant growth
    model_spec = "R/ConstSpec.r",
    model_name = "Constant",
    model_file = "Const"
  ),
  
  exp_model = list( #The exponential model uses the census interval as the integration step size for numerical methods
    model_spec = "R/ExponentialSpec.r",
    model_name = "Exponential",
    model_file = "Exp"
  )
)
  
#Specify which hierarchy of model to build
level_of_models <- list(
  single_species = TRUE,
  single_individual = FALSE
)

#Choose "opt" for optimizing() or "samp" for MCMC. This will apply to all models.
est_method <- "samp" 

#Choose 1: Euler, 2: Midpoint, 3: RK4. This will apply to all models.
int_method <- 3 

#Change to TRUE to run models
run_models <- TRUE
```

Now we construct the model building controls.
```{r}
model_build_controls <- list(
  model_list = model_list,
  
  level_of_models = level_of_models,
  
  rstan_data_locations = rstan_data_locations,
  
  plot_diagnostics = TRUE,
  
  est_method = est_method,
  
  int_method = int_method,
  
  inc_warmup = FALSE #Include warmup samples in diagnostic plots
)
```

The models are ready to run.

Depending on the integration method, model, chains, and cores this step can take from a few minutes to a few hours. Example fit files are provided if you wish to skip the computationally intensive bit. Running the files as given with chains in sequence rather than parallel takes about an hour.
```{r}
if(run_models){ 
  source("R/RunModels.r", local=TRUE)
  
  build_all_models(model_build_controls)
}
```

Once the fits are run we can extract the estimates and build output.
```{r}
#Sample extraction controls
extract_samples <- FALSE

#Control list to be passed to sample extraction
data_extract_controls <- list(
  model_list = model_list,
  
  level_of_models = level_of_models,
  
  rstan_data_locations = rstan_data_locations,
  
  est_method = est_method,
  
  int_method = int_method
)

#Analysis controls
analyse_models <- TRUE

analysis_controls <- list(
  model_list = model_list,
  
  est_method = est_method,
  
  int_method = int_method
)

#Run extraction and analysis
if(extract_samples){
  source("R/EstimateWrangling.r", local=TRUE)
  
  build_extracted_samples(data_extract_controls)
}

if(analyse_models){
  source("R/Analyse.r", local=TRUE)
  
  model_analysis(analysis_controls)
}
```

The compiled data files can be loaded in and examined to see the CIs for species-level estimates, individual-level parameter estimates, and size-level values. Further analysis can be done on the extracted values.