Skip to content

Commit

Permalink
Added simulation analysis code and simulated individual data.
Browse files Browse the repository at this point in the history
  • Loading branch information
Tess-LaCoil committed Aug 26, 2024
1 parent ce331e1 commit 5a7666f
Show file tree
Hide file tree
Showing 2 changed files with 399 additions and 0 deletions.
399 changes: 399 additions & 0 deletions Multiple_simulation_analysis.r
Original file line number Diff line number Diff line change
@@ -0,0 +1,399 @@
# Code to extract results from simulations and corresponding model fits.
# Assumes that the saved fits are stored in a directory called input
# with specific naming structure.

#Load in model fits, extract info, export to data structures
{
library(tidyverse)
library(lubridate)
library(rstan)
library(grid)
library(ggridges)
library(cowplot)
}

structure <- "25Yr_ObsPeriod_6Obs"
error_type <- "Ruger" #"Norm"
batches <- 1100

#simulated data available
sim_data_list <- readRDS("data/50ind_25Yr_ObsPeriod_6Obs_CanhamSim.rds")

sp_pars <- c("species_max_growth_mean",
"species_max_growth_sd",
"species_diameter_at_max_growth_mean",
"species_diameter_at_max_growth_sd",
"species_K_mean",
"species_K_sd")
sp_pars_true_vals <- c(mean(log(sim_data_list$sim_ind_info$g_max)),
sd(log(sim_data_list$sim_ind_info$g_max)),
mean(log(sim_data_list$sim_ind_info$s_max)),
sd(log(sim_data_list$sim_ind_info$s_max)),
mean(log(sim_data_list$sim_ind_info$k)),
sd(log(sim_data_list$sim_ind_info$k)))
ind_par_names <- c("ind_max_growth", "ind_diameter_at_max_growth", "ind_K")
growth_function_pars <- c("g_max", "s_max", "k")

#Diagnostics
diag_data <- read.csv(paste0("input/",error_type,"Diag.csv"))
diag_data_reduced <- diag_data %>%
filter(Rhat > 1.05, DivTrans >50)

#Exclusion lists come from the diag_data_reduced frame with double checking
#exclude <- c(192, 227, 85, 888, 910) #Norm
#exclude <- c(1013, 523, 655, 657) #Ruger

load_data <- FALSE
if(load_data){
#Initialise data structures
error_reduction_data <- tibble()
ind_data <- tibble()
species_data <- tibble()
error_par_data <- tibble()
sim_data <- tibble()

for(i in 1:batches){ #Iterate through simulations
if((i-1) %% 25 == 0){
print(paste0("Model: ", i))
}

filename <- paste0("input/fits/Fit_", error_type, "_", i, "_1_25Yr_ObsPeriod_6Obs_FullModel.rds")
#Load model if it exists, otherwise create dummy row
if(!file.exists(filename) || (i %in%exclude)){
temp_sim_data <- tibble(
batch = i,
start_time = 0,
end_time = 0,
runtime = -1
)

sim_data <- rbind(sim_data, temp_sim_data)
} else {
model_save <- readRDS(filename)
sim_data_list <- readRDS("data/50ind_25Yr_ObsPeriod_6Obs_CanhamSim.rds")

if(length(model_save$error_save_data)>0){
sim_data_list$sim_data$S_obs <- model_save$error_save_data$S_obs
sim_data_list$sim_data <- sim_data_list$sim_data %>%
group_by(treeid) %>%
arrange(time) %>%
mutate(delta_S_obs = lead(S_obs) - S_obs) %>%
ungroup() %>%
arrange(treeid, time)

#Extract samples
est_data <- rstan::extract(model_save$fit, permuted=TRUE)

#Extract error parameter data
error_par_temp <- tibble(
batch = i,
sigma_e_mean = mean(est_data$global_error_sigma),
sigma_e_median = median(est_data$global_error_sigma),
sigma_e_CI_lower = quantile(est_data$global_error_sigma, 0.025),
sigma_e_CI_upper = quantile(est_data$global_error_sigma, 0.975)
)
error_par_data <- rbind(error_par_data, error_par_temp)

#Extract size and growth measurement error reduction
size_data <- tibble(
true_size = sim_data_list$sim_data$S_true,
obs_size = sim_data_list$sim_data$S_obs,
ind_fit = apply(est_data$S_hat, 2, mean)
)

size_data <- size_data %>%
mutate(
obs_diff = obs_size - true_size,
ind_diff = ind_fit - true_size,
)

growth_data <- tibble(
true_growth = sim_data_list$sim_data$delta_S,
obs_growth = sim_data_list$sim_data$delta_S_obs,
ind_fit = apply(est_data$G_hat, 2, mean)
)

growth_data <- growth_data %>%
mutate(
obs_diff = obs_growth - true_growth,
ind_diff = ind_fit - true_growth
) %>%
drop_na()

temp_error_reduction_data <- tibble(
batch = i,
error_type = error_type,
obs_size_RMSE = sqrt(sum(size_data$obs_diff^2)/nrow(size_data)),
ind_size_RMSE = sqrt(sum(size_data$ind_diff^2)/nrow(size_data)),
obs_growth_RMSE = sqrt(sum(growth_data$obs_diff^2)/nrow(growth_data)),
ind_growth_RMSE = sqrt(sum(growth_data$ind_diff^2)/nrow(growth_data)),
size_RMSE_percent_reduction = (1-(ind_size_RMSE/obs_size_RMSE))*100,
growth_RMSE_percent_reduction = (1-(ind_growth_RMSE/obs_growth_RMSE))*100
)

error_reduction_data <- rbind(error_reduction_data, temp_error_reduction_data)

#Extract individual parameter estimates
for(j in 1:length(ind_par_names)){
#Get percentage of ind_par CIs that contains true value, par estimate
ind_CI_contain_true <- c()
for(k in 1:50){
CI_lower <- as.numeric(quantile(est_data[[ind_par_names[j]]][,k], probs=0.025))
CI_upper <- as.numeric(quantile(est_data[[ind_par_names[j]]][,k], probs=0.975))
true_val <- sim_data_list$sim_ind_info[[growth_function_pars[j]]][k]

ind_CI_contain_true[k] <- (true_val >= CI_lower) && (true_val <= CI_upper)

ind_par_temp <- tibble(batch = i,
treeid = k,
par_name = ind_par_names[j],
true_val = true_val,
est = mean(est_data[[ind_par_names[j]]][,k]),
ci_lower = CI_lower,
ci_upper = CI_upper)
ind_par_temp$ind_ci_contain_true <- ind_CI_contain_true[k]
ind_par_temp$error <- ind_par_temp$est - ind_par_temp$true_val
ind_data <- rbind(ind_data, ind_par_temp)
}
}

#Extract species-level estimates
for(j in 1:length(sp_pars)){
temp_species_data <- tibble(
batch = i,
error_type = error_type,
par_name = sp_pars[j],
par_true_val = sp_pars_true_vals[j],
par_est = mean(est_data[[sp_pars[j]]]),
par_ci_lower = as.numeric(quantile(est_data[[sp_pars[j]]], probs=0.025)),
par_ci_upper = as.numeric(quantile(est_data[[sp_pars[j]]], probs=0.975))
)
temp_species_data <- temp_species_data %>%
mutate(ci_contain_true =
((par_true_val > par_ci_lower) &&
(par_true_val < par_ci_upper)))
species_data <- rbind(species_data, temp_species_data)
}

#Extract sim-level info
sim_data_temp <- tibble(batch = i,
start_time = model_save$model_start_time,
end_time = model_save$model_end_time) %>%
mutate(runtime = difftime(end_time, start_time, units = "mins"))
sim_data <- rbind(sim_data, sim_data_temp)
}
}
}

save_data <- list(
error_reduction_data = error_reduction_data,
ind_data = ind_data,
species_data = species_data,
sim_data = sim_data,
error_par_data = error_par_data
)

saveRDS(save_data, file= paste0("output/data/", error_type, "_summary_data.rds"))
}

#-----------------------------------------------------------------------------#
# Start here if data already processed
#-----------------------------------------------------------------------------#
#load in saved data
save_data <- readRDS(file= paste0("output/data/", error_type, "_summary_data.rds"))
#Initialise data structures
error_reduction_data <- save_data$error_reduction_data
ind_data <- save_data$ind_data
species_data <- save_data$species_data
sim_data <- save_data$sim_data
sim_data_filtered <- sim_data %>%
filter(runtime > 0)

mean(sim_data_filtered$runtime)
hist(error_reduction_data$size_RMSE_percent_reduction, main = "", col = "lightblue",
xlab = "RMSE % reduction for size", breaks = 20)
hist(error_reduction_data$growth_RMSE_percent_reduction, main = "", col = "lightgreen",
xlab = "RMSE % reduction for growth", breaks = 20)

mean(error_par_data$sigma_e_mean) #Mean estimate
error_par_data <- error_par_data %>% #Construct CIs for N(0,0.1)
mutate(true_geq_lower = (0.1 >= sigma_e_CI_lower),
true_leq_upper = (0.1 <= sigma_e_CI_upper))
error_par_data$true_in_CI <- error_par_data$true_geq_lower & error_par_data$true_leq_upper
sum(error_par_data$true_in_CI)/ nrow(error_par_data)

#Extract information on how much error in size and growth was reduced
error_reduction_summary <- error_reduction_data %>%
summarise(meanSizeRed = mean(size_RMSE_percent_reduction),
meanGrowthRed = mean(growth_RMSE_percent_reduction),
meanSizeRMSEobs = mean(obs_size_RMSE),
meanSizeRMSEfit =mean(ind_size_RMSE),
meanGrowthRMSEobs = mean(obs_growth_RMSE),
meanGrowthRMSEfit = mean(ind_growth_RMSE))

ind_data_summary <- ind_data %>%
group_by(par_name) %>%
summarise(CIs_contain_true = sum(ind_ci_contain_true),
mean_CI_width = mean(ci_upper - ci_lower),
RMSE = sqrt(sum((true_val - est)^2)/nrow(sim_data))) %>%
mutate(CI_percent = (CIs_contain_true/(50*nrow(sim_data_filtered))) * 100)

#plots for individual data
ind_par_names <- c("ind_max_growth", "ind_diameter_at_max_growth", "ind_K")
ind_par_names_fancy <- c("g_max", "S_max", "k")
plots <- list()
for(i in 1:3){
temp <- ind_data %>%
filter(par_name == ind_par_names[i]) %>%
mutate(CI_width = ci_upper - ci_lower,
error = est - true_val)

plots[[i]] <- ggplot(data=temp, aes(x = true_val,y = error))+
geom_point(size=2, colour = "green4", fill="#1EB300", alpha = 0.1) +
labs(x = paste("True Value: ", ind_par_names_fancy[i], sep=""),
y = "Error (est - true)") +
geom_abline(intercept=0, slope=0, linetype="dashed", linewidth = 1, alpha=0.5) +
scale_x_log10() +
#scale_y_log10() +
theme_classic()
}
ind_par_plot_grid <- plot_grid(plots[[1]], plots[[2]], plots[[3]],
align = "h", nrow = 1, labels = c("(d)", "(e)", "(f)"))

#Extract info on species-level parameter performance
species_data_summary <- species_data %>%
group_by(par_name) %>%
summarise(
true_val = mean(par_true_val),
mean_est = mean(par_est),
per_contain_true = (sum(ci_contain_true)/nrow(sim_data))*100,
mean_ci_width = mean(par_ci_upper - par_ci_lower),
RMSE = sqrt(sum((par_true_val - par_est)^2)/nrow(sim_data))
)

#plots for species-level
species_data <- species_data %>%
group_by(par_name) %>%
mutate(
Fancy_name = case_when(
par_name == "species_max_growth_mean" ~ "ln(g_max) Mean",
par_name == "species_max_growth_sd" ~ "ln(g_max) SD",
par_name == "species_diameter_at_max_growth_mean" ~ "ln(S_max) Mean",
par_name == "species_diameter_at_max_growth_sd" ~ "ln(S_max) SD",
par_name == "species_K_mean" ~ "ln(k) Mean",
par_name == "species_K_sd" ~ "ln(k) SD",
.default = "Whoops"
)) %>%
mutate(error = par_est - par_true_val) %>%
ungroup()


#create density ridge plot
ggplot(species_data, aes(x = error, y = Fancy_name)) +
geom_density_ridges(aes(fill=Fancy_name), alpha = 0.8) +
geom_vline(xintercept=0, linetype = "dashed", color = "black", linewidth = 1) +
ylab("Parameter") +
xlab("Error (est - true)") +
theme_classic() +
theme(legend.position = "none")


#------------------------------------------------------------------------------#
#Analysis of least growth individual
growth_function <- function(x, pars){
growth <- pars[[1]] *
exp(-0.5 * (log(x / pars[[2]]) / pars[[3]])^2 )
return(growth)
}

least_growth_ind <- ind_data %>%
filter(treeid == 35)

hist_data1 <- least_growth_ind %>%
filter(par_name == "ind_max_growth") %>%
mutate(err = est - true_val)
hist_gmax <- ggplot(data = hist_data1, aes(err)) +
geom_histogram(fill = "#72b000",
color = "black") +
geom_vline(xintercept = 0, linewidth = 1, linetype = "dashed") +
labs(x = "Max growth rate g_max error",
y = "Count") +
theme_classic()

hist_data2 <- least_growth_ind %>%
filter(par_name == "ind_diameter_at_max_growth") %>%
mutate(err = est - true_val)
hist_Smax <- ggplot(data = hist_data2, aes(err)) +
geom_histogram(fill = "#72b000",
color = "black") +
geom_vline(xintercept = 0, linewidth = 1, linetype = "dashed") +
labs(x = "DBH at max growth rate S_max error",
y = "Count") +
theme_classic()

hist_data3 <- least_growth_ind %>%
filter(par_name == "ind_K") %>%
mutate(err = est - true_val)
hist_k <- ggplot(data = hist_data3, aes(err)) +
geom_histogram(fill = "#72b000",
color = "black") +
geom_vline(xintercept = 0, linewidth = 1, linetype = "dashed") +
labs(x = "Spread K error",
y = "Count") +
theme_classic()

rstan_data <- readRDS("data/50ind_25Yr_ObsPeriod_6Obs_CanhamSim.rds")
ind_35_data <- rstan_data$sim_data %>%
filter(treeid == 35)
ind_35_pars <- list(pars = c(ind_35_data$g_max[1],
ind_35_data$s_max[1],
ind_35_data$k[1]))

S_0 <- ind_35_data$S_0[1]
S_final <- ind_35_data$S_final[1]

ind35_growth <- ggplot() +
geom_function(fun=growth_function, args=ind_35_pars, alpha=0.5,
color="#72b000", linewidth=1, xlim=c(1, max(rstan_data$sim_data$S_true)),
linetype = "solid") +
geom_function(fun=growth_function, args=ind_35_pars, alpha=1,
color="#72b000", linewidth=2.5, xlim=c(S_0, S_final),
linetype = "solid") +
labs(x = "Size (cm)", y = "Growth rate (cm/yr)") +
theme_classic()

hist_grid <- plot_grid(hist_gmax, hist_Smax, hist_k, align = "hv", ncol = 1)
least_growth_plot <- plot_grid(ind35_growth, hist_grid, align = "h", nrow = 1,
rel_widths = c(0.6, 0.4))

#Take sample of posterior parameter estimates to plot growth functions
sim_ids <- sample(sim_data_filtered$batch, size=20)

ind35_growth_fns <- ggplot()
for(i in 1:20){
ests <- least_growth_ind %>%
filter(batch == sim_ids[i])
temp_pars <- list(pars = c(ests$est[which(ests$par_name == "ind_max_growth")],
ests$est[which(ests$par_name == "ind_diameter_at_max_growth")],
ests$est[which(ests$par_name == "ind_K")]))

ind35_growth_fns <- ind35_growth_fns +
geom_function(fun=growth_function, args=temp_pars, alpha=0.2,
color="#000000", linewidth=0.5, xlim=c(1, max(rstan_data$sim_data$S_true)),
linetype = "solid")
}

ind35_growth_fns <- ind35_growth_fns +
geom_function(fun=growth_function, args=ind_35_pars, alpha=0.5,
color="#72b000", linewidth=1, xlim=c(1, max(rstan_data$sim_data$S_true)),
linetype = "solid") +
geom_function(fun=growth_function, args=ind_35_pars, alpha=1,
color="#72b000", linewidth=2.5, xlim=c(S_0, S_final),
linetype = "solid") +
labs(x = "Size (cm)", y = "Growth rate (cm/yr)") +
theme_classic()

ind35_growth_fns_grid <- plot_grid(ind35_growth_fns)

least_growth_plot_fns <- plot_grid(ind35_growth_fns_grid, hist_grid, align = "h", nrow = 1,
rel_widths = c(0.6, 0.4))
Binary file added data/50ind_25Yr_ObsPeriod_6Obs_CanhamSim.rds
Binary file not shown.

0 comments on commit 5a7666f

Please sign in to comment.