-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added simulation analysis code and simulated individual data.
- Loading branch information
1 parent
ce331e1
commit 5a7666f
Showing
2 changed files
with
399 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,399 @@ | ||
# Code to extract results from simulations and corresponding model fits. | ||
# Assumes that the saved fits are stored in a directory called input | ||
# with specific naming structure. | ||
|
||
#Load in model fits, extract info, export to data structures | ||
{ | ||
library(tidyverse) | ||
library(lubridate) | ||
library(rstan) | ||
library(grid) | ||
library(ggridges) | ||
library(cowplot) | ||
} | ||
|
||
structure <- "25Yr_ObsPeriod_6Obs" | ||
error_type <- "Ruger" #"Norm" | ||
batches <- 1100 | ||
|
||
#simulated data available | ||
sim_data_list <- readRDS("data/50ind_25Yr_ObsPeriod_6Obs_CanhamSim.rds") | ||
|
||
sp_pars <- c("species_max_growth_mean", | ||
"species_max_growth_sd", | ||
"species_diameter_at_max_growth_mean", | ||
"species_diameter_at_max_growth_sd", | ||
"species_K_mean", | ||
"species_K_sd") | ||
sp_pars_true_vals <- c(mean(log(sim_data_list$sim_ind_info$g_max)), | ||
sd(log(sim_data_list$sim_ind_info$g_max)), | ||
mean(log(sim_data_list$sim_ind_info$s_max)), | ||
sd(log(sim_data_list$sim_ind_info$s_max)), | ||
mean(log(sim_data_list$sim_ind_info$k)), | ||
sd(log(sim_data_list$sim_ind_info$k))) | ||
ind_par_names <- c("ind_max_growth", "ind_diameter_at_max_growth", "ind_K") | ||
growth_function_pars <- c("g_max", "s_max", "k") | ||
|
||
#Diagnostics | ||
diag_data <- read.csv(paste0("input/",error_type,"Diag.csv")) | ||
diag_data_reduced <- diag_data %>% | ||
filter(Rhat > 1.05, DivTrans >50) | ||
|
||
#Exclusion lists come from the diag_data_reduced frame with double checking | ||
#exclude <- c(192, 227, 85, 888, 910) #Norm | ||
#exclude <- c(1013, 523, 655, 657) #Ruger | ||
|
||
load_data <- FALSE | ||
if(load_data){ | ||
#Initialise data structures | ||
error_reduction_data <- tibble() | ||
ind_data <- tibble() | ||
species_data <- tibble() | ||
error_par_data <- tibble() | ||
sim_data <- tibble() | ||
|
||
for(i in 1:batches){ #Iterate through simulations | ||
if((i-1) %% 25 == 0){ | ||
print(paste0("Model: ", i)) | ||
} | ||
|
||
filename <- paste0("input/fits/Fit_", error_type, "_", i, "_1_25Yr_ObsPeriod_6Obs_FullModel.rds") | ||
#Load model if it exists, otherwise create dummy row | ||
if(!file.exists(filename) || (i %in%exclude)){ | ||
temp_sim_data <- tibble( | ||
batch = i, | ||
start_time = 0, | ||
end_time = 0, | ||
runtime = -1 | ||
) | ||
|
||
sim_data <- rbind(sim_data, temp_sim_data) | ||
} else { | ||
model_save <- readRDS(filename) | ||
sim_data_list <- readRDS("data/50ind_25Yr_ObsPeriod_6Obs_CanhamSim.rds") | ||
|
||
if(length(model_save$error_save_data)>0){ | ||
sim_data_list$sim_data$S_obs <- model_save$error_save_data$S_obs | ||
sim_data_list$sim_data <- sim_data_list$sim_data %>% | ||
group_by(treeid) %>% | ||
arrange(time) %>% | ||
mutate(delta_S_obs = lead(S_obs) - S_obs) %>% | ||
ungroup() %>% | ||
arrange(treeid, time) | ||
|
||
#Extract samples | ||
est_data <- rstan::extract(model_save$fit, permuted=TRUE) | ||
|
||
#Extract error parameter data | ||
error_par_temp <- tibble( | ||
batch = i, | ||
sigma_e_mean = mean(est_data$global_error_sigma), | ||
sigma_e_median = median(est_data$global_error_sigma), | ||
sigma_e_CI_lower = quantile(est_data$global_error_sigma, 0.025), | ||
sigma_e_CI_upper = quantile(est_data$global_error_sigma, 0.975) | ||
) | ||
error_par_data <- rbind(error_par_data, error_par_temp) | ||
|
||
#Extract size and growth measurement error reduction | ||
size_data <- tibble( | ||
true_size = sim_data_list$sim_data$S_true, | ||
obs_size = sim_data_list$sim_data$S_obs, | ||
ind_fit = apply(est_data$S_hat, 2, mean) | ||
) | ||
|
||
size_data <- size_data %>% | ||
mutate( | ||
obs_diff = obs_size - true_size, | ||
ind_diff = ind_fit - true_size, | ||
) | ||
|
||
growth_data <- tibble( | ||
true_growth = sim_data_list$sim_data$delta_S, | ||
obs_growth = sim_data_list$sim_data$delta_S_obs, | ||
ind_fit = apply(est_data$G_hat, 2, mean) | ||
) | ||
|
||
growth_data <- growth_data %>% | ||
mutate( | ||
obs_diff = obs_growth - true_growth, | ||
ind_diff = ind_fit - true_growth | ||
) %>% | ||
drop_na() | ||
|
||
temp_error_reduction_data <- tibble( | ||
batch = i, | ||
error_type = error_type, | ||
obs_size_RMSE = sqrt(sum(size_data$obs_diff^2)/nrow(size_data)), | ||
ind_size_RMSE = sqrt(sum(size_data$ind_diff^2)/nrow(size_data)), | ||
obs_growth_RMSE = sqrt(sum(growth_data$obs_diff^2)/nrow(growth_data)), | ||
ind_growth_RMSE = sqrt(sum(growth_data$ind_diff^2)/nrow(growth_data)), | ||
size_RMSE_percent_reduction = (1-(ind_size_RMSE/obs_size_RMSE))*100, | ||
growth_RMSE_percent_reduction = (1-(ind_growth_RMSE/obs_growth_RMSE))*100 | ||
) | ||
|
||
error_reduction_data <- rbind(error_reduction_data, temp_error_reduction_data) | ||
|
||
#Extract individual parameter estimates | ||
for(j in 1:length(ind_par_names)){ | ||
#Get percentage of ind_par CIs that contains true value, par estimate | ||
ind_CI_contain_true <- c() | ||
for(k in 1:50){ | ||
CI_lower <- as.numeric(quantile(est_data[[ind_par_names[j]]][,k], probs=0.025)) | ||
CI_upper <- as.numeric(quantile(est_data[[ind_par_names[j]]][,k], probs=0.975)) | ||
true_val <- sim_data_list$sim_ind_info[[growth_function_pars[j]]][k] | ||
|
||
ind_CI_contain_true[k] <- (true_val >= CI_lower) && (true_val <= CI_upper) | ||
|
||
ind_par_temp <- tibble(batch = i, | ||
treeid = k, | ||
par_name = ind_par_names[j], | ||
true_val = true_val, | ||
est = mean(est_data[[ind_par_names[j]]][,k]), | ||
ci_lower = CI_lower, | ||
ci_upper = CI_upper) | ||
ind_par_temp$ind_ci_contain_true <- ind_CI_contain_true[k] | ||
ind_par_temp$error <- ind_par_temp$est - ind_par_temp$true_val | ||
ind_data <- rbind(ind_data, ind_par_temp) | ||
} | ||
} | ||
|
||
#Extract species-level estimates | ||
for(j in 1:length(sp_pars)){ | ||
temp_species_data <- tibble( | ||
batch = i, | ||
error_type = error_type, | ||
par_name = sp_pars[j], | ||
par_true_val = sp_pars_true_vals[j], | ||
par_est = mean(est_data[[sp_pars[j]]]), | ||
par_ci_lower = as.numeric(quantile(est_data[[sp_pars[j]]], probs=0.025)), | ||
par_ci_upper = as.numeric(quantile(est_data[[sp_pars[j]]], probs=0.975)) | ||
) | ||
temp_species_data <- temp_species_data %>% | ||
mutate(ci_contain_true = | ||
((par_true_val > par_ci_lower) && | ||
(par_true_val < par_ci_upper))) | ||
species_data <- rbind(species_data, temp_species_data) | ||
} | ||
|
||
#Extract sim-level info | ||
sim_data_temp <- tibble(batch = i, | ||
start_time = model_save$model_start_time, | ||
end_time = model_save$model_end_time) %>% | ||
mutate(runtime = difftime(end_time, start_time, units = "mins")) | ||
sim_data <- rbind(sim_data, sim_data_temp) | ||
} | ||
} | ||
} | ||
|
||
save_data <- list( | ||
error_reduction_data = error_reduction_data, | ||
ind_data = ind_data, | ||
species_data = species_data, | ||
sim_data = sim_data, | ||
error_par_data = error_par_data | ||
) | ||
|
||
saveRDS(save_data, file= paste0("output/data/", error_type, "_summary_data.rds")) | ||
} | ||
|
||
#-----------------------------------------------------------------------------# | ||
# Start here if data already processed | ||
#-----------------------------------------------------------------------------# | ||
#load in saved data | ||
save_data <- readRDS(file= paste0("output/data/", error_type, "_summary_data.rds")) | ||
#Initialise data structures | ||
error_reduction_data <- save_data$error_reduction_data | ||
ind_data <- save_data$ind_data | ||
species_data <- save_data$species_data | ||
sim_data <- save_data$sim_data | ||
sim_data_filtered <- sim_data %>% | ||
filter(runtime > 0) | ||
|
||
mean(sim_data_filtered$runtime) | ||
hist(error_reduction_data$size_RMSE_percent_reduction, main = "", col = "lightblue", | ||
xlab = "RMSE % reduction for size", breaks = 20) | ||
hist(error_reduction_data$growth_RMSE_percent_reduction, main = "", col = "lightgreen", | ||
xlab = "RMSE % reduction for growth", breaks = 20) | ||
|
||
mean(error_par_data$sigma_e_mean) #Mean estimate | ||
error_par_data <- error_par_data %>% #Construct CIs for N(0,0.1) | ||
mutate(true_geq_lower = (0.1 >= sigma_e_CI_lower), | ||
true_leq_upper = (0.1 <= sigma_e_CI_upper)) | ||
error_par_data$true_in_CI <- error_par_data$true_geq_lower & error_par_data$true_leq_upper | ||
sum(error_par_data$true_in_CI)/ nrow(error_par_data) | ||
|
||
#Extract information on how much error in size and growth was reduced | ||
error_reduction_summary <- error_reduction_data %>% | ||
summarise(meanSizeRed = mean(size_RMSE_percent_reduction), | ||
meanGrowthRed = mean(growth_RMSE_percent_reduction), | ||
meanSizeRMSEobs = mean(obs_size_RMSE), | ||
meanSizeRMSEfit =mean(ind_size_RMSE), | ||
meanGrowthRMSEobs = mean(obs_growth_RMSE), | ||
meanGrowthRMSEfit = mean(ind_growth_RMSE)) | ||
|
||
ind_data_summary <- ind_data %>% | ||
group_by(par_name) %>% | ||
summarise(CIs_contain_true = sum(ind_ci_contain_true), | ||
mean_CI_width = mean(ci_upper - ci_lower), | ||
RMSE = sqrt(sum((true_val - est)^2)/nrow(sim_data))) %>% | ||
mutate(CI_percent = (CIs_contain_true/(50*nrow(sim_data_filtered))) * 100) | ||
|
||
#plots for individual data | ||
ind_par_names <- c("ind_max_growth", "ind_diameter_at_max_growth", "ind_K") | ||
ind_par_names_fancy <- c("g_max", "S_max", "k") | ||
plots <- list() | ||
for(i in 1:3){ | ||
temp <- ind_data %>% | ||
filter(par_name == ind_par_names[i]) %>% | ||
mutate(CI_width = ci_upper - ci_lower, | ||
error = est - true_val) | ||
|
||
plots[[i]] <- ggplot(data=temp, aes(x = true_val,y = error))+ | ||
geom_point(size=2, colour = "green4", fill="#1EB300", alpha = 0.1) + | ||
labs(x = paste("True Value: ", ind_par_names_fancy[i], sep=""), | ||
y = "Error (est - true)") + | ||
geom_abline(intercept=0, slope=0, linetype="dashed", linewidth = 1, alpha=0.5) + | ||
scale_x_log10() + | ||
#scale_y_log10() + | ||
theme_classic() | ||
} | ||
ind_par_plot_grid <- plot_grid(plots[[1]], plots[[2]], plots[[3]], | ||
align = "h", nrow = 1, labels = c("(d)", "(e)", "(f)")) | ||
|
||
#Extract info on species-level parameter performance | ||
species_data_summary <- species_data %>% | ||
group_by(par_name) %>% | ||
summarise( | ||
true_val = mean(par_true_val), | ||
mean_est = mean(par_est), | ||
per_contain_true = (sum(ci_contain_true)/nrow(sim_data))*100, | ||
mean_ci_width = mean(par_ci_upper - par_ci_lower), | ||
RMSE = sqrt(sum((par_true_val - par_est)^2)/nrow(sim_data)) | ||
) | ||
|
||
#plots for species-level | ||
species_data <- species_data %>% | ||
group_by(par_name) %>% | ||
mutate( | ||
Fancy_name = case_when( | ||
par_name == "species_max_growth_mean" ~ "ln(g_max) Mean", | ||
par_name == "species_max_growth_sd" ~ "ln(g_max) SD", | ||
par_name == "species_diameter_at_max_growth_mean" ~ "ln(S_max) Mean", | ||
par_name == "species_diameter_at_max_growth_sd" ~ "ln(S_max) SD", | ||
par_name == "species_K_mean" ~ "ln(k) Mean", | ||
par_name == "species_K_sd" ~ "ln(k) SD", | ||
.default = "Whoops" | ||
)) %>% | ||
mutate(error = par_est - par_true_val) %>% | ||
ungroup() | ||
|
||
|
||
#create density ridge plot | ||
ggplot(species_data, aes(x = error, y = Fancy_name)) + | ||
geom_density_ridges(aes(fill=Fancy_name), alpha = 0.8) + | ||
geom_vline(xintercept=0, linetype = "dashed", color = "black", linewidth = 1) + | ||
ylab("Parameter") + | ||
xlab("Error (est - true)") + | ||
theme_classic() + | ||
theme(legend.position = "none") | ||
|
||
|
||
#------------------------------------------------------------------------------# | ||
#Analysis of least growth individual | ||
growth_function <- function(x, pars){ | ||
growth <- pars[[1]] * | ||
exp(-0.5 * (log(x / pars[[2]]) / pars[[3]])^2 ) | ||
return(growth) | ||
} | ||
|
||
least_growth_ind <- ind_data %>% | ||
filter(treeid == 35) | ||
|
||
hist_data1 <- least_growth_ind %>% | ||
filter(par_name == "ind_max_growth") %>% | ||
mutate(err = est - true_val) | ||
hist_gmax <- ggplot(data = hist_data1, aes(err)) + | ||
geom_histogram(fill = "#72b000", | ||
color = "black") + | ||
geom_vline(xintercept = 0, linewidth = 1, linetype = "dashed") + | ||
labs(x = "Max growth rate g_max error", | ||
y = "Count") + | ||
theme_classic() | ||
|
||
hist_data2 <- least_growth_ind %>% | ||
filter(par_name == "ind_diameter_at_max_growth") %>% | ||
mutate(err = est - true_val) | ||
hist_Smax <- ggplot(data = hist_data2, aes(err)) + | ||
geom_histogram(fill = "#72b000", | ||
color = "black") + | ||
geom_vline(xintercept = 0, linewidth = 1, linetype = "dashed") + | ||
labs(x = "DBH at max growth rate S_max error", | ||
y = "Count") + | ||
theme_classic() | ||
|
||
hist_data3 <- least_growth_ind %>% | ||
filter(par_name == "ind_K") %>% | ||
mutate(err = est - true_val) | ||
hist_k <- ggplot(data = hist_data3, aes(err)) + | ||
geom_histogram(fill = "#72b000", | ||
color = "black") + | ||
geom_vline(xintercept = 0, linewidth = 1, linetype = "dashed") + | ||
labs(x = "Spread K error", | ||
y = "Count") + | ||
theme_classic() | ||
|
||
rstan_data <- readRDS("data/50ind_25Yr_ObsPeriod_6Obs_CanhamSim.rds") | ||
ind_35_data <- rstan_data$sim_data %>% | ||
filter(treeid == 35) | ||
ind_35_pars <- list(pars = c(ind_35_data$g_max[1], | ||
ind_35_data$s_max[1], | ||
ind_35_data$k[1])) | ||
|
||
S_0 <- ind_35_data$S_0[1] | ||
S_final <- ind_35_data$S_final[1] | ||
|
||
ind35_growth <- ggplot() + | ||
geom_function(fun=growth_function, args=ind_35_pars, alpha=0.5, | ||
color="#72b000", linewidth=1, xlim=c(1, max(rstan_data$sim_data$S_true)), | ||
linetype = "solid") + | ||
geom_function(fun=growth_function, args=ind_35_pars, alpha=1, | ||
color="#72b000", linewidth=2.5, xlim=c(S_0, S_final), | ||
linetype = "solid") + | ||
labs(x = "Size (cm)", y = "Growth rate (cm/yr)") + | ||
theme_classic() | ||
|
||
hist_grid <- plot_grid(hist_gmax, hist_Smax, hist_k, align = "hv", ncol = 1) | ||
least_growth_plot <- plot_grid(ind35_growth, hist_grid, align = "h", nrow = 1, | ||
rel_widths = c(0.6, 0.4)) | ||
|
||
#Take sample of posterior parameter estimates to plot growth functions | ||
sim_ids <- sample(sim_data_filtered$batch, size=20) | ||
|
||
ind35_growth_fns <- ggplot() | ||
for(i in 1:20){ | ||
ests <- least_growth_ind %>% | ||
filter(batch == sim_ids[i]) | ||
temp_pars <- list(pars = c(ests$est[which(ests$par_name == "ind_max_growth")], | ||
ests$est[which(ests$par_name == "ind_diameter_at_max_growth")], | ||
ests$est[which(ests$par_name == "ind_K")])) | ||
|
||
ind35_growth_fns <- ind35_growth_fns + | ||
geom_function(fun=growth_function, args=temp_pars, alpha=0.2, | ||
color="#000000", linewidth=0.5, xlim=c(1, max(rstan_data$sim_data$S_true)), | ||
linetype = "solid") | ||
} | ||
|
||
ind35_growth_fns <- ind35_growth_fns + | ||
geom_function(fun=growth_function, args=ind_35_pars, alpha=0.5, | ||
color="#72b000", linewidth=1, xlim=c(1, max(rstan_data$sim_data$S_true)), | ||
linetype = "solid") + | ||
geom_function(fun=growth_function, args=ind_35_pars, alpha=1, | ||
color="#72b000", linewidth=2.5, xlim=c(S_0, S_final), | ||
linetype = "solid") + | ||
labs(x = "Size (cm)", y = "Growth rate (cm/yr)") + | ||
theme_classic() | ||
|
||
ind35_growth_fns_grid <- plot_grid(ind35_growth_fns) | ||
|
||
least_growth_plot_fns <- plot_grid(ind35_growth_fns_grid, hist_grid, align = "h", nrow = 1, | ||
rel_widths = c(0.6, 0.4)) |
Binary file not shown.