-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMain.rmd
995 lines (818 loc) · 35.8 KB
/
Main.rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
---
output: html_document
editor_options:
chunk_output_type: console
---
# Demonstration code
In this code we walk through the analysis for [preprint], building a method for fitting a hierarchical Bayesian longitudinal model to repeat measurement data.
In this analysis we will build simulated datasets of tree size with various growth functions, then fit models to that data. We will also demonstrate the method applied to a real-world dataset. In the course of the investigation we consider the behaviour of different sample sizes, growth functions, and numerical integration methods.
The data files used in the paper are available as a release associated with the repo. We have included the data we use from BCI in the repo itself, which is sampled from a bigger dataset. Details for where to access the raw data is available in the relevant section.
Build file structures for output.
```{r}
if (!file.exists("output")){
dir.create("output")
dir.create("output/data")
dir.create("output/figures")
dir.create("output/figures/diagnostic")
dir.create("output/figures/sampled")
dir.create("output/figures/RungeKuttaDemo")
}
```
Load libraries:
```{r}
library(rstan)
library(rstantools)
library(tidyverse)
library(vioplot)
library(ggbeeswarm)
library(parallel)
library(latex2exp)
library(grid)
library(svglite)
library(mnormt)
```
Now we are ready to start doing things.
First we set the file locations for output and initialise data structures.
```{r}
rstan_data_locations <- list(
single_species_data = "output/data/SingleSpecies_RStan",
single_individual_data = "output/data/SingleInd_RStan"
)
data_build_models <- list( #Swap out as required
exp_model = list(
model_spec = "R/ExponentialSpec.r",
model_name = "Exponential",
model_file = "Exp"
)
)
data_build_storage_list <- list(
canham_model = list(
model_spec = "R/CanhamSpec.r",
model_name = "Canham",
model_file = "Canham"
),
const_model = list(
model_spec = "R/ConstSpec.r",
model_name = "Constant",
model_file = "Const"
),
exp_model = list(
model_spec = "R/ExponentialSpec.r",
model_name = "Exponential",
model_file = "Exp"
)
)
sim_pars <- list(N_ind = 20, #Parameters controlling simulation
S0_mean = 2,
S0_sd = 1,
step_size = 0.1,
N_obs = 5, #Observations per individual
census_interval = 5,
model = "exp")
sim_pars$time <- seq(from=0, to=(sim_pars$N_obs*sim_pars$census_interval -1),
by=sim_pars$census_interval)
#Control list to be passed to data construction
data_construction_controls <- list(
data_build_models = data_build_models,
sim_pars = sim_pars,
build_data_single_individual = TRUE,
error_type = "Norm", #Norm for N(0, 0.5) error, Ruger for more complex model
#Specify where data is to be saved and loaded from.
rstan_data_locations = rstan_data_locations
)
#Change to TRUE to generate data
build_datasets <- TRUE
```
We are ready to build the first multi-individual data sets. These are produced with error added, in a later section we will look at observations without error to investigate integration methods. Datasets built with the Canham function will include a step size parameter which is used to control sub-steps of the Runge-Kutta 4th order integration.
```{r}
if(build_datasets){
source("R/BuildData.r", local=TRUE)
build_data(data_construction_controls)
}
```
Now we are going to build data sets for the different sample sizes using the constant growth model. Building the largest dataset (10,000 individuals) can take a few minutes.
```{r}
data_build_models_sample_size <- list(
const_model = list(
model_spec = "R/ConstSpec.r",
model_name = "Constant",
model_file = "Const"
)
)
sim_pars <- list(N_ind = 10, #Parameters controlling simulation
S0_mean = 1.5,
S0_sd = 1,
step_size = 0.1,
N_obs = 5, #Observations per individual
census_interval = 5,
model = "const")
sim_pars$time <- seq(from=0, to=(sim_pars$N_obs*sim_pars$census_interval -1),
by=sim_pars$census_interval)
data_construction_controls <- list(
data_build_models = data_build_models_sample_size,
sim_pars = sim_pars,
build_data_single_individual = FALSE,
#Specify where data is to be saved and loaded from.
rstan_data_locations = rstan_data_locations,
error_type="Norm"
)
build_datasets <- TRUE
for(i in c(10, 100, 1000, 10000)){
data_construction_controls$sim_pars$N_ind <- i
data_construction_controls$rstan_data_locations$single_species_data <-
paste("output/data/", i, "_" , "SingleSpecies_RStan", sep="")
if(build_datasets){ #10 indviduals first
source("R/BuildData.r", local=TRUE)
build_data(data_construction_controls)
}
}
```
# Single time series demonstration
Here we will demonstrate the effect of each integration method (Euler, midpoint, Runge-Kutta 4th order) on the size and growth parmeter estimates from a single individual's growth history, projected forward with an exponential growth function.
Load in functions.
```{r}
source("R/RungeKuttaFunctions.r")
```
Set parameters and initial condition
```{r}
beta <- 1.0
S_0 <- 1.0
step_size <- 1.0
n_step <- 5
time <- c(0, 1, 2, 3, 4)
#Vectors of data
est_names <- c("Euler", "Midpoint", "RK4")
colours <- c("blue", "red", "green4", "black") #Default for non-error models
```
Numerical integration without MCMC or error, which requires knowing the value of beta.
```{r}
growth <- function(size, pars){
return(pars[1]*size)
}
#Euler method
euler_int <- euler_est(S_0, growth, pars=c(beta), step_size, nstep)
#Midpoint method
midpoint_int <- midpoint_est(S_0, growth, pars=c(beta), step_size, nstep)
#Runge-Kutta 4th order
runge_kutta_int <- rk4_est(S_0, growth, pars=c(beta), step_size, nstep)
```
Plot the numerical integration projections.
```{r}
lines_list <- list(euler_int, midpoint_int, runge_kutta_int)
points_list <- lines_list
names <- c(est_names, "Function")
lty_list <- c(1,1,1,1)
pch_list <- c(3,4,5,NA)
title <- "Numerical Integration"
#Produce plot
plot_estimates(lines_list, points_list, names, colours,
pch_list, lty_list, title, lwd=2)
#Save output to file
file_name <- "output/figures/RungeKuttaDemo/Single_NumericalIntegration.svg"
save_plot_of_estimates(lines_list, points_list, names, colours, pch_list,
lty_list, title, file_name, lwd=2)
```
In this instance we will use the analytic solution to construct the data rather than using a forward projection with a numerical method as we did above.
Build base observation data
```{r}
N_obs <- n_step
census <- c(1, 2, 3, 4, 5)
S_obs <- S_0 * exp(beta*time)
census_interval <- rep(step_size, times=5) #Step size
rstan_data <- build_rstan_data(N_obs, S_obs, census, census_interval)
```
Now we construct a data set with measurement error.
```{r}
S_obs_err <- c()
for(i in 1:length(S_obs)){
S_obs_err[i] <- add_error(S_obs[i], "Norm") #Implementing error model
}
#Alternate implementation
S_obs_err <- S_obs + rnorm(length(S_obs), 0, 0.5)
rstan_data_error <- build_rstan_data(N_obs, S_obs_err, census, census_interval)
```
From here we run the Bayesian models
```{r}
model_outputs_single <- list()
```
MCMC estimation from analytic solution to demonstrate the different integration methods. As this will run chains in sequence it may take a few minutes. Output plots can be seen in output/figures/RungeKuttaDemo/.
```{r}
model_outputs_single$MCMC <- build_output(rstan_file = "stan/Exp_SingleIndividual.stan",
rstan_data = rstan_data,
est_names,
est_method = "samp",
int_methods = c(1,2,3),
title = "MCMC Estimation",
file_name = "output/figures/RungeKuttaDemo/Single_MCMCIntegration.svg",
names = c(est_names, "Function"),
colours,
lty_list <- c(1,1,1,1),
pch_list <- c(3,4,5,NA),
estplot = TRUE)
```
MCMC estimation with measurement error.
```{r}
model_outputs_single$MCMC_Error <- build_output(rstan_file = "stan/Exp_SingleIndividual.stan",
rstan_data = rstan_data_error,
est_names,
est_method = "samp",
int_methods = c(1,2,3),
title = "MCMC Estimation With Error",
file_name = "output/figures/RungeKuttaDemo/Single_MCMCIntegration_Error.svg",
names = c(est_names, "Function"),
colours,
lty_list <- c(1,1,1,1),
pch_list <- c(3,4,5,NA),
estplot = TRUE)
```
To investigate the numerical integration behaviour fully we repeat the error simulation and model fitting.
Warning: The following block will take a long time to run! You can reduce the number of iterations required using the n_iter argument.
```{r}
n_iter = 100
size_2_est <- tibble()
beta_est <- tibble()
model <- stan_model(file="stan/Exp_SingleIndividual.stan")
for(i in 1:n_iter){
#Add error to analytic solution
S_obs_err <- abs(S_obs + rnorm(length(S_obs), 0, 0.5))
rstan_data_error <- build_rstan_data(N_obs, S_obs_err, census, census_interval)
#Iterate through the integration methods getting estimates from each one.
for(j in 1:3){
fit <- sampling(model, data=rstan_data_error[[j]],
iter=2000,
chains=4,
cores=4)
est_data <- rstan::extract(fit, permuted=TRUE)
size_temp <- tibble(int_method = j, iter = i, size_obs = S_obs[2],
size_est = apply(est_data$S_hat, 2, mean)[2])
beta_temp <- tibble(int_method = j, iter = i,
beta = mean(est_data[["ind_beta"]]))
size_2_est <- rbind(size_2_est, size_temp)
beta_est <- rbind(beta_est, beta_temp)
}
}
boxplot(beta_est$beta ~ beta_est$int_method, xlab = "Integration Method", ylab="Beta",
xaxt = "n", col=c("blue3", "red", "green4"), main="Estimates of Beta")
abline(h=1, col="grey18")
axis(1, at = c(1,2,3), labels = c("Euler", "Midpoint", "RK4"))
boxplot(size_2_est$size_est ~ size_2_est$int_method, xlab = "Integration Method", ylab="S(2)",
xaxt = "n", col=c("blue3", "red", "green4"), main="Estimates of S(2)")
abline(h=S_obs[2], col="grey18")
axis(1, at = c(1,2,3), labels = c("Euler", "Midpoint", "RK4"))
```
# BCI data demonstration
In this section we will fit a Canham growth model to a sample of individuals from Garcinia recondita in the Barro Colorado Island forest plot data:
Condit, R., S. Lao, R. Pérez, S. Aguilar, R.B. Foster, S.P. Hubbell. (2019). Complete data from the Barro Colorado 50-ha plot: 423617 trees, 35 years, v3, DataONE Dash, Dataset, https://doi.org/10.15146/5xcp-0d46.
A simple random sample without replacement was used to select 400 individuals with 6 observations. The RStan data file has been constructed and is stored as gar2in_rstan_data.rds in the data folder.
```{r}
run_model_BCI_example <- TRUE
if(run_model_BCI_example){
source("R/CanhamSpec.r", local=TRUE)
source("R/RunModels.r", local=TRUE)
dataset <- readRDS("data/gar2in_rstan_data.rds")
fit <- build_rstan_model(stan_controls = single_species_controls$stan_controls,
model_name = single_species_controls$model_name,
stan_file_name = single_species_controls$stan_file_path,
est_method = "samp",
dataset) #Index 3 used to fit into the existing functions
filename <- paste(single_species_controls$fit_save_path, "GRecondita", sep="_")
filename <- paste(filename, ".rds", sep="")
saveRDS(fit, file=filename) #Output to file
#Wrangle file names for diagnostic plots
diag_list <- single_species_controls$diagnostic_list
for(j in 1:length(diag_list)){
diag_list[[j]]$name <- paste(diag_list[[j]]$name, "GRecondita", sep="_")
}
#Plot diagnostics
plot_all_diagnostics(fit, int_method = 3,
diag_list,
inc_warmup = FALSE,
RHat_name = "GRecon")
#Further diagnostics
#Initialise data frame for Rhat diagnostics.
Rhat_data <- data.frame(row.names=names(fit@sim[["samples"]][[1]]))
#Extract values to data frame
for(i in 1:length(fit@sim[["samples"]])){
for(j in 1:length(fit@sim[["samples"]][[1]])){
Rhat_data[j,i] <- Rhat(fit@sim[["samples"]][[i]][[j]])
}
}
#Get a histogram of all the RHats
histvec <- c()
for(i in 1:ncol(Rhat_data)){ histvec <- c(histvec, Rhat_data[,i]) }
hist(histvec, col="lightblue", main="Histogram of Rhats for all chains", xlab="Rhat")
length(which(histvec > 1.05))
saveRDS(fit, file="output/data/Single_Species_Fit_Canham_GRecondita.rds")
rm(fit)
}
```
Once the model has been fit and the diagnostics do not indicate problems, we can extract the fitted values and analyse them.
```{r}
extract_analyse <- TRUE
if(extract_analyse){
#Load in files
fit <- readRDS("output/data/Single_Species_Fit_Canham_GRecondita.rds")
dataset <- readRDS("data/gar2in_rstan_data.rds")
dataset$int_method <- 3
source("R/EstimateWrangling.r", local=TRUE)
source("R/Analyse.r", local=TRUE)
source("R/CanhamSpec.r", local=TRUE)
#Sample extraction and analysis
#Building data frames
measurement_data <- tibble(treeid_factor = dataset$treeid_factor,
S_obs = dataset$S_obs,
census = dataset$census,
census_interval = dataset$census_interval)
measurement_data <- measurement_data %>%
group_by(treeid_factor) %>%
mutate(treeid = dataset$tree_id_vec[treeid_factor],
time=cumsum(census_interval)) %>%
ungroup()
individual_data <- tibble(treeid = dataset$tree_id_vec,
treeid_factor = 1:400)
#Extract samples
est_data <- rstan::extract(fit, permuted=TRUE)
#Measurement-level data
measurement_data$S_hat <- apply(est_data$S_hat, 2, mean)
measurement_data$G_hat <- apply(est_data$G_hat, 2, mean)
#Individual-level data
for(i in ind_pars){
individual_data[[ind_pars_names[which(ind_pars == i)]]] <- apply(est_data[[i]], 2, mean)
}
#Hist/boxplot combos
hist_names <- c("Max growth - g_max", "Size at max growth - S_max", "Spread - k")
for(i in 1:length(growth_par_names)){
layout(mat = matrix(c(1,2),2,1, byrow=TRUE), height = c(2,8))
par(mar=c(0, 3.1, 1.1, 2.1))
boxplot(individual_data[[growth_par_names[i]]] , horizontal=TRUE ,
xaxt="n" ,
col="palegreen3" , frame=F)
par(mar=c(4, 3.1, 1.1, 2.1))
hist(individual_data[[growth_par_names[i]]] , breaks=10 ,
col="palegreen" , main="" ,
xlab=hist_names[i])
}
#Get final size for each individual
temp <- measurement_data %>%
group_by(treeid) %>%
mutate(S_final = max(S_hat)) %>%
distinct(treeid, S_final) %>%
ungroup() %>%
select(S_final)
individual_data$S_final <- temp$S_final
#Compare individual-level parameters
pairs(~individual_data$canham_max_growth_hat +
individual_data$canham_diameter_at_max_growth_hat +
individual_data$canham_K_hat,
labels = c("g_max", "S_max", "K"))
#Large-scale plot of single pair
pairplot <- ggplot(data=individual_data, aes(x,y))+
geom_point(aes(x=canham_max_growth_hat, y=canham_K_hat),
size=2, color="green4", alpha=0.6) +
labs(x = "g_max", y = "k") +
theme_classic()
#Species-level data
species_par_est <- c()
for(i in 1:length(sp_pars)){ #Extract estimates
species_par_est[i] <- mean(est_data[[sp_pars[i]]])
}
species_data <- tibble(species_par_est, sp_pars_names, Lower=c(), Upper=c())
#Build CIs
hyperpar_CI <- build_par_CI(est_data, sp_pars, sp_pars_names)
for(i in 1:length(hyperpar_CI)){
species_data$Lower[i] <- hyperpar_CI[[i]][[1]]
species_data$Upper[i] <- hyperpar_CI[[i]][[2]]
}
#Save data to file
save_data <- list(individual_data = individual_data,
measurement_data = measurement_data,
species_data = species_data)
save_filename <- "output/data/GRecondita_Sampled_Canham_CompiledData.rds"
saveRDS(save_data, file=save_filename)
#Load if needed
save_data <- readRDS("output/data/GRecondita_Sampled_Canham_CompiledData.rds")
individual_data <- save_data$individual_data
measurement_data <- save_data$measurement_data
species_data <- save_data$species_data
#Plots of individual sizes over time, estimated and observed
n_ind <- 400 #Number of plots to do
plot_obs_and_est_life_history(measurement_data, n_ind, "Canham")
#Scatter plot of individual R^2 values against total estimated growth
r_sq_summary <- measurement_data %>%
group_by(treeid_factor) %>%
summarise(
total_est_growth = max(S_hat) - min(S_hat),
r_sq = cor(S_hat, S_obs)^2
)
file_name <- "output/figures/RSq_Growth_Scatter.png"
r_sq_scatter <- ggplot(data = r_sq_summary, aes(x=total_est_growth, y=r_sq))+
geom_point(colour = "green4", alpha=0.5) +
labs(x = "Total est. growth (cm)",
y = "R-sq: obs. and est. sizes")+
theme_classic() +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=14,face="bold"))
ggsave(file_name, plot=r_sq_scatter, width=100, height=100, units="mm")
#Number of individuals with R^2 < 0.75
sum(r_sq_summary$r_sq <0.75)
sum(r_sq_summary$r_sq <0.25)
#Maximum growth for individuals with R^2 < 0.75
max(r_sq_summary$total_est_growth[which(r_sq_summary$r_sq <0.75)])
#Total sample R^2
cor(measurement_data$S_obs, measurement_data$S_hat)^2
#Plots of sizes over time, and growth over size, for all individuals
plot_est_life_history(measurement_data, model_name="Canham")
#Plot of individual size over time based on percentiles of individual parameters
ind_pars_est <- c(g_max <- quantile(individual_data$canham_max_growth_hat, 0.5),
S_max <- quantile(individual_data$canham_diameter_at_max_growth_hat, 0.5),
k <- quantile(individual_data$canham_K_hat, 0.5))
pop_pars_est <- c(g_max_mean <- exp(species_data$species_par_est[1]),
S_max_mean <- exp(species_data$species_par_est[3]),
k_mean <- exp(species_data$species_par_est[5]))
S_0 <- rep(3, n_ind) #Start at 3cm in size to avoid getting stuck at 1cm
step_size <- 0.1 #Precision of integration method
lifespan <- 300 #How long to project forward tree size.
plot_pair_life_history(pop_pars_est, ind_pars_est, growth_function,
name = "BCIAvg",
S_0, step_size, lifespan)
#Plot growth functions for all individuals.
n_ind <- 400
growth_par_ests <- data.frame(g_max = individual_data$canham_max_growth_hat,
s_max = individual_data$canham_diameter_at_max_growth_hat,
k = individual_data$canham_K_hat)
#Plot the growth function histories starting at 3cm
plot_sample_life_history(growth_par_ests, growth_function,
name = "GRecondita_EstimatedPars",
n_ind, S_0, step_size=step_size, lifespan=lifespan)
#Plot the growth function histories starting at S_0
plot_sample_life_history(growth_par_ests, growth_function,
name = "GRecondita_From_S_0",
n_ind,
S_0 = individual_data$canham_S_0_hat,
S_final = individual_data$S_final,
step_size, lifespan = 150,
min_growth_size = min(individual_data$canham_S_0_hat),
max_growth_size = 100)
plot_sample_life_history(growth_par_ests, growth_function,
name = "GRecondita_Observed",
n_ind,
S_0 = individual_data$canham_S_0_hat,
S_final = individual_data$S_final,
step_size = step_size,
lifespan = 150)
}
```
For a direct comparison to Herault (2011) we fit a second G. recondita model. This one uses pairwise differences as point estimates of growth independent of the repeat measurement structure.
```{r}
#Build dataset from pairwise differences
source("R/RunModels.r")
dataset <- readRDS("data/gar2in_rstan_data.rds")
temp_obs <- tibble(
S_obs = dataset$S_obs,
treeid_factor = dataset$treeid_factor,
census = dataset$census,
census_interval = dataset$census_interval
) %>%
group_by(treeid_factor) %>%
mutate(delta_S_obs = lead(S_obs) - S_obs) %>%
ungroup() %>%
select(S_obs, delta_S_obs, census_interval) %>%
drop_na()
species_rstan_data <-
list(
N_obs = nrow(temp_obs),
S_obs = temp_obs$S_obs,
delta_obs = temp_obs$delta_S_obs,
census_interval = temp_obs$census_interval
)
#Build model
plot_diagnostics_species_level <- TRUE
model <- "canham"
sp_model <- stan_model(file="stan/Canham_SpeciesLevel.stan")
sp_model_start <- Sys.time()
print(paste("Fitting species-level Canham model: ", sp_model_start, sep=""))
species_only_fit <- sampling(sp_model, data=species_rstan_data,
iter=2000,
chains=4,
cores=4)
sp_model_end <- Sys.time()
print(paste("Species-level Canham model end: ",sp_model_end, sep=""))
#warnings()
if(plot_diagnostics_species_level){
plot_diagnostic_trace(species_only_fit, 3, c("species_max_growth",
"species_diameter_at_max_growth",
"species_K"),
paste("GRecondita", model, "species_model_pars", sep="_"), FALSE)
}
#Save fit
saveRDS(species_only_fit, "output/data/GRecondita_SpeciesLevelFit.rds")
rm(species_only_fit)
```
To compare the species-level model to the results from the hierarchical set-up, we plot the average trajectory of the species-level model over the individual trajectories.
```{r}
source("R/Analyse.r")
#Import individual-level parameters
individual_data <- readRDS("output/data/GRecondita_Sampled_Canham_CompiledData.rds")$individual_data
#Extract species level parameters
species_only_fit <- readRDS("output/data/GRecondita_SpeciesLevelFit.rds")
sp_est_data <- rstan::extract(species_only_fit, permuted=TRUE)
species_growth_pars <- tibble(
g_max = mean(sp_est_data$species_max_growth),
S_max = mean(sp_est_data$species_diameter_at_max_growth),
k = mean(sp_est_data$species_K),
S_0 = 1,
S_final = max(individual_data$S_final), #Get maximum size
fit_model = "Species-level model"
)
ind_growth_pars <- tibble(
g_max = individual_data$canham_max_growth_hat,
S_max = individual_data$canham_diameter_at_max_growth_hat,
k = individual_data$canham_K_hat,
S_0 = individual_data$canham_S_0_hat,
S_final = individual_data$S_final, #Get vector of final
fit_model = "Individual fits"
)
plot <- ggplot() +
xlim(1, max(individual_data$S_final)) +
xlab("Size (cm)") +
ylab("Growth rate (cm/yr)") +
theme_classic() +
theme(axis.text=element_text(size=16),
axis.title=element_text(size=18,face="bold"))
for(i in 2:nrow(ind_growth_pars)){
args_list <- list(pars=ind_growth_pars[i,1:3])
plot <- plot +
geom_function(fun=growth_function, args=args_list, alpha=0.2,
linewidth=1, xlim=c(ind_growth_pars$S_0[i], ind_growth_pars$S_final[i]),
color = "green4", linetype = "solid")
}
plot <- plot +
geom_function(fun=growth_function,
args=list(pars=ind_growth_pars[1,1:3]),
alpha=0.2,
linewidth=1,
aes(col = ind_growth_pars$fit_model[1],
linetype = ind_growth_pars$fit_model[1]),
xlim=c(ind_growth_pars$S_0[1], ind_growth_pars$S_final[1])) +
geom_function(fun=growth_function,
args=list(pars=species_growth_pars[1,1:3]),
alpha=1,
linewidth=1.2,
aes(col = species_growth_pars$fit_model[1],
linetype = species_growth_pars$fit_model[1]),
xlim=c(0.5, max(individual_data$S_final))) +
scale_color_manual(values = c("green4", "black"),
name = "Parameter source") +
scale_linetype_manual(values = c("solid", "dashed"),
name = "Parameter source") +
theme(legend.position = "inside",
legend.position.inside = c(0.7,0.8),
legend.title = element_text(size=14),
legend.text = element_text(size=12),
legend.key.width = unit(1.2, 'cm'),
axis.text=element_text(size=12),
axis.title=element_text(size=14))
plot
```
As part of the method analysis we want to check what the out-of-sample predictive power is like, compared to the ability to predict sizes within the sample. To examine this we look at the ability of models fit to the first 5 observations with regards to their capacity to extrapolate to a 6th size that we can compare to the observed 6th size.
We are going to build 2 more models: a hierarchical model with Canham growth function fit to 5 observations per individual, and a hierarchical model with constant growth function and individual level average growth parameters fit to 5 observations. The constant growth model is directly analogous to a linear mixed effects model for sizes over time with individual effect term.
The hierarchical models fit to 5 observations will estimate the 6th size based off of the estimated 5th size and forward projection of the growth curve with the individual growth parameters and the census interval between the 5th and 6th observations. The hierarchical model fit to 6 observations directly estimates the 6th size.
```{r}
fit_5_obs <- TRUE
if(fit_5_obs){
source("R/RunModels.r", local=TRUE)
#Select model parameters
stan_controls = list(
iter = 2000,
chains = 4,
cores = 4,
control = list(max_treedepth = 20)
)
#Fit Canham hierarchical model to 5 observations
#This fit takes a while, expect 50+ hours.
source("R/CanhamSpec.r")
dataset <- readRDS("data/gar2in_5_rstan_data.rds")
fit <- build_rstan_model(stan_controls,
model_name = "Canham",
stan_file_name = "stan/Canham_SingleSpecies.stan",
est_method = "samp",
rstan_data = dataset)
saveRDS(fit, file="output/data/Single_Species_Fit_Canham_GRecondita_5obs.rds")
#Wrangle file names for diagnostic plots
diag_list <- single_species_controls$diagnostic_list
for(j in 1:length(diag_list)){
diag_list[[j]]$name <- paste(diag_list[[j]]$name, "GRecondita_5obs", sep="_")
}
#Plot diagnostics
plot_all_diagnostics(fit, int_method = 3,
diag_list,
inc_warmup = FALSE,
RHat_name = "GRecon_Canham_5obs")
rm(fit)
#Fit constant growth model to 5 observations
dataset <- readRDS("data/gar2in_5_rstan_data.rds")
source("R/ConstSpec.r")
fit <- build_rstan_model(stan_controls,
model_name = "Constant",
stan_file_name = "stan/ConstGrowth_SingleSpecies.stan",
est_method = "samp",
rstan_data = dataset)
saveRDS(fit, file="output/data/Single_Species_Fit_Const_GRecondita_5obs.rds")
#Wrangle file names for diagnostic plots
diag_list <- single_species_controls$diagnostic_list
for(j in 1:length(diag_list)){
diag_list[[j]]$name <- paste(diag_list[[j]]$name, "GRecondita_5obs", sep="_")
}
#Plot diagnostics
plot_all_diagnostics(fit, int_method = 3,
diag_list,
inc_warmup = FALSE,
RHat_name = "GRecon_Const_5obs")
rm(fit)
}
```
The next step involves extracting estimated final sizes from different models to compare RMSE. We will look at the hierarchical Canham model fit to 6 observations as a baseline in-sample RMSE, and compare that to each of the three models fit to the first 5 observations. For the hierarchical models (Canham and constant growth) we use the estimated 5th size as the starting point for extrapolation to the 6th. For the species-average model we use the observed 5th size instead.
```{r}
extract_predictive_RMSE <- TRUE
if(extract_predictive_RMSE){
#Extract size estimates from models
dataset_all_obs <- readRDS("data/gar2in_rstan_data.rds")
tibble_6th_obs <- tibble(
S_obs = dataset_all_obs$S_obs,
treeid_factor = dataset_all_obs$treeid_factor,
census = dataset_all_obs$census,
census_interval = dataset_all_obs$census_interval
) %>%
filter(census == 6)
fit_save_paths <- c(
"output/data/Single_Species_Fit_Canham_GRecondita_5obs.rds",
"output/data/Single_Species_Fit_Const_GRecondita_5obs.rds",
"output/data/Single_Species_Fit_Canham_GRecondita.rds"
)
fit_name <- c(
"hierarchical_canham_5_obs",
"hierarchical_const_5_obs",
"hierarchical_canham_6_obs"
)
fit_title <- c(
"Hierarchical Canham fit to 5 obs.",
"Hierarchical constant fit to 5 obs.",
"Hierarchical Canham fit to 6 obs."
)
model_spec_name_vec <- c(
"R/CanhamSpec.r",
"R/ConstSpec.r",
"R/CanhamSpec.r"
)
dataset <- readRDS("data/gar2in_5_rstan_data.rds")
source("R/EstimateWrangling.r")
pars_list <- list()
starting_size <- tibble()
for(i in 1:length(fit_name)){
fit <- readRDS(fit_save_paths[i])
print(paste0("Wrangling fit: ", fit_name[i]))
temp_return_data <- extract_est_tibble(fit,
model_name = fit_name[i],
model_spec_name = model_spec_name_vec[i],
dataset,
S_6_obs = tibble_6th_obs$S_obs,
census_interval = tibble_6th_obs$census_interval)
starting_size <- rbind(starting_size, temp_return_data$starting_size)
pars_list <- c(pars_list, temp_return_data$par_list)
}
starting_size$S_final_hat <- 0
source("R/Analyse.r")
#Estimate size for 6th census based on RK4 projection forward from 5th size estimate
for(i in 1:nrow(starting_size)){
if((i-1) %% 50 == 0){
print(paste0("Row: ", i))
}
source(starting_size$model_spec_name[i])
#Estimation from individual model
pars <- pars_list[[i]]$pars
temp_size_est <- rk4_est(S_0 = starting_size$S_5[i],
growth = growth_function,
pars = pars,
step_size = starting_size$census_interval[i]/5,
N_step = 5)
starting_size$S_final_hat[i] <- temp_size_est[5]
}
#Calculate error
starting_size$S_final_error <- starting_size$S_final_hat - starting_size$S_6_obs
par(mfrow = c(3,1))
#Histogram of errors
for(i in 1:length(fit_name)){
temp_size <- starting_size %>%
filter(model_name == fit_name[i])
rmse <- sqrt(sum(temp_size$S_final_error^2)/nrow(temp_size))
hist(temp_size$S_final_error,
main = paste0(fit_title[i], " RMSE: ", signif(rmse, digits = 3)),
xlab = "Error est - obs (cm)",
xlim = c(min(starting_size$S_final_error), max(starting_size$S_final_error)),
col = "lightblue")
print(paste0("RMSE for ", fit_name[i], " is: ", rmse))
}
par(mfrow = c(1,1))
}
```
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Multi-individual demonstration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In this section we will expand our simulation analysis to multiple individuals so we are able to compare the performance of integration methods for species-level parameters, and across individuals.
First we specify which models to run.
As written, this code can use MCMC to fit each of constant, exponential, and Canham models to simulated data from those respective growth functions at the multi-species level using the Euler integration method. We will work with the Canham model for the paper.
To change integration method, modify the int_method parameter.
To modify the number of cores, chains, and samples go to the single_species_controls object in the relevant spec file.
To exclude particular models remove them from the model_list.
Model fits will be saved to output/data/ as .rds files. In the next section the samples will be extracted and transformed into something usable.
```{r}
rstan_data_locations <- list(
single_species_data = "output/data/SingleSpecies_RStan",
single_individual_data = "output/data/SingleInd_RStan"
)
model_list <- list(
exp_model = list( #The exponential model uses the census interval as the integration step size for numerical methods
model_spec = "R/ExponentialSpec.r",
model_name = "Exponential",
model_file = "Exp"
))#,
storage_list <- list(
canham_model = list(#This model requires a step size argument in the data passed to it.
model_spec = "R/CanhamSpec.r",
model_name = "Canham",
model_file = "Canham",
step_size = 1
),
const_model = list( #All numerical methods are equivalent for constant growth
model_spec = "R/ConstSpec.r",
model_name = "Constant",
model_file = "Const"
),
exp_model = list( #The exponential model uses the census interval as the integration step size for numerical methods
model_spec = "R/ExponentialSpec.r",
model_name = "Exponential",
model_file = "Exp"
)
)
#Specify which hierarchy of model to build
level_of_models <- list(
single_species = TRUE,
single_individual = FALSE
)
#Choose "opt" for optimizing() or "samp" for MCMC. This will apply to all models.
est_method <- "samp"
#Choose 1: Euler, 2: Midpoint, 3: RK4. This will apply to all models.
int_method <- 3
#Change to TRUE to run models
run_models <- TRUE
```
Now we construct the model building controls.
```{r}
model_build_controls <- list(
model_list = model_list,
level_of_models = level_of_models,
rstan_data_locations = rstan_data_locations,
plot_diagnostics = TRUE,
est_method = est_method,
int_method = int_method,
inc_warmup = FALSE #Include warmup samples in diagnostic plots
)
```
The models are ready to run.
Depending on the integration method, model, chains, and cores this step can take from a few minutes to a few hours. Example fit files are provided if you wish to skip the computationally intensive bit. Running the files as given with chains in sequence rather than parallel takes about an hour.
```{r}
if(run_models){
source("R/RunModels.r", local=TRUE)
build_all_models(model_build_controls)
}
```
Once the fits are run we can extract the estimates and build output.
```{r}
#Sample extraction controls
extract_samples <- FALSE
#Control list to be passed to sample extraction
data_extract_controls <- list(
model_list = model_list,
level_of_models = level_of_models,
rstan_data_locations = rstan_data_locations,
est_method = est_method,
int_method = int_method
)
#Analysis controls
analyse_models <- TRUE
analysis_controls <- list(
model_list = model_list,
est_method = est_method,
int_method = int_method
)
#Run extraction and analysis
if(extract_samples){
source("R/EstimateWrangling.r", local=TRUE)
build_extracted_samples(data_extract_controls)
}
if(analyse_models){
source("R/Analyse.r", local=TRUE)
model_analysis(analysis_controls)
}
```
The compiled data files can be loaded in and examined to see the CIs for species-level estimates, individual-level parameter estimates, and size-level values. Further analysis can be done on the extracted values.