-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBootstrapped GAMM.qmd
321 lines (276 loc) · 12 KB
/
Bootstrapped GAMM.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
---
title: "Bootstrapped GAMMs"
format: html
editor: visual
---
# Bootstrapped 95%CIs and ECx from GAMMs
Checked 02/05/2023
Similar to my previous blog on deriving 'Effect Concentrations' from GLMMs, this blog details a way to code bootstrapped 95% CI from GAMMs and their bootstrapped ECx. I know of no method to extract ECx from GAMMs analytically like we can for GLMMs, so here I interpolate from the predictions. This isn't super accurate but the error will be quite small -- make sure to keep the prediction length \>100 though.
This dataset doesn't really require a GAMM, I've just used it for demonstration purposes. I've added an observational-level random effect, which is typically used for tank experiments.
Lets load in some data.
## Running Code
When you click the **Render** button a document will be generated that includes both content and the output of embedded code. You can embed code like this:
```{r}
# Load packages (if not installed Tools->Install Packages)
library(mgcv)
library(lattice)
library(MASS)
library(MuMIn)
data1 <- read.table(file="https://raw.githubusercontent.com/gerard-ricardo/data/master/2015_uppersurface", header= TRUE, dec=",")
#data1
# 2 Labeling and wrangling -----------------------------------------------
data1$sus_sed <- as.numeric(as.character(data1$sus.sed))
data1$log.x <- log10(data1$sus_sed) #log10 trans
data1$fert <- as.numeric(as.character(data1$fert))
data1$suc <- data1$fert
data1$tot <- as.numeric(as.character(data1$tot))
data1$obs <- factor(formatC(1:nrow(data1), flag="0", width = 3))# unique tank ID for later on
data1$log.x <- ifelse(data1$log.x < 0.1, 0, data1$log.x) #Replace neg with zero
```
Next let's fit a regular GAMM for demonstration sake. Here we are fitting a GAMM, and predicting along raw.x in the data.frame df1. We convert the predicted SE to approximate 95% CI by multiplying them by 1.96.
```{r, echo=FALSE, message=FALSE, warning=FALSE, results='hide'}
m2.gamm<-gamm(cbind(suc,(tot - suc))~s(log.x, fx = F, k = 3),
random = list(obs=~1),
family=binomial,
data=data1,
method = "REML") #
df1 <- data.frame(log.x = seq(min(data1$log.x), max(data1$log.x), length = 30)) #setting up new data frame (df) defining log.x values to run
pred.md1 <- predict(m2.gamm, newdata=df1, type = "response", se.fit=T, lpmatrix=TRUE)
df1$pred = pred.md1$fit
plot(data1$log.x,(data1$suc / data1$tot),main="GAM") #second plot
lines(df1$log.x, df1$pred, type = "l", lwd = 2, col = 2, xaxt = "n", las = 1) #plot model mean line
df1$up.ci = pred.md1$fit + 1.96 * pred.md1$se.fit #predicting the data1 95% CI
df1$lo.ci = pred.md1$fit - 1.96 * pred.md1$se.fit #predicting the lower 95% CI
lines(df1$log.x, df1$up.ci, type = "l", lwd = 2, xaxt = "n", las = 1) #plot data1 95% CI
lines(df1$log.x, df1$lo.ci, type = "l", lwd = 2, xaxt = "n", las = 1) #plot lower 95% CI
```
The code used to interpolate the ECx is below. This function calculates the ECx from the maximum of the curve. It simply works by finding the nearest value in the prediction data.frame to X% of the maximum response, and then the corresponding value in the predictor column. If you have a unimodal 'hump-shaped' trend, it will only calculate one ECx (I think).
```{r}
ecx.interp <- function(ecx, df1) {
inhibx <- max(df1$pred) * ecx
nearest_idx <- which.min(abs(df1$pred - inhibx))
df2 <- data.frame(
ecx = df1$log.x[nearest_idx],
lower = df1$log.x[which.min(abs(df1$lo.ci - inhibx))],
data = df1$log.x[which.min(abs(df1$up.ci - inhibx))]
)
return(df2)
}
ecx.interp(0.5, df1) #this is the ECx calculate between the top and 0.
```
Ok, now for a GAMM with bootstrapped 95% CI. Make sure to keep sims = as multiple of 200, otherwise the CIs can't calculate properly. Ideally your finally attempt will want around 1000 otherwise the CIs will look wobbly, like below.
Here we have the code above wrapped into a function called m2.gamm, which outputs the prediction. In the for-loop, we resample the raw data.frame and then apply this function. The prediction is stored into a list and then a data.frame called df3. This data.frame is then ordered and next subset by the 0.025 (2.5%) or 0.975 (97.5%) columns. Because columns are integers, make sure your sims multiplied by these proportions result in an integer e.g. 200.
```{r}
set.seed(123)
lst = list() #initial list
fit_gam <- function(resampled) {
m2.gamm <- gamm(cbind(suc, (tot - suc)) ~ s(log.x, fx = F, k = 3),
random = list(obs = ~1),
family = binomial,
data = resampled,
method = "REML",
verbosePQL = F)
pred.md1 <- predict(m2.gamm, newdata=df1, type = "response",se.fit=T)
out = unname(pred.md1$fit)
# lst[[1]] = dd
# return(lst)
return(out)
} #gam function returns prediction
#Now resample the residuals x times
resid_gam <- residuals(m2.gamm$gam, type = "response")
sims = 200
for (i in 1:sims) {
resampled_resid <- resid_gam[sample(length(resid_gam), replace = TRUE)]
new_response <- m2.gamm$gam$fitted.values + resampled_resid
resampled_data <- data1
resampled_data$suc <- round(pmin(pmax(new_response * resampled_data$tot, 0), resampled_data$tot)) #pmax prevents values less than 0
out <- fit_gam(resampled_data)
lst[[i]] <- out
}
#lst #returns all predictions in list
df3 <- do.call(rbind, lst) #add this to 'loop into a list'
#df3 #so all the predicted fits from the x simulations are stored as rows in this list
```
Now lets see how each resampled curve looks
```{r}
# plot output of predictions
df4 <- as.data.frame(lst)
df4$log.x <- df1$log.x
# df4$log.x
library(tidyverse)
data1.long <- df4 %>%
tidyr::pivot_longer(-log.x, names_to = "factors", values_to = "meas") %>%
data.frame() # keep vec.x, add all other columns to factors , add all their values to meas)
ggplot(data1.long, aes(x = log.x, y = meas, color = "steelblue1", group = factors)) +
geom_line() +
# geom_point(aes(y = meas)) +
labs(x = "Predictor", y = "Response", title = "Multiple Model Fits") +
theme_minimal() +
theme(legend.position = "none")
```
Let's take the 2.5% and the 97.5% lines to see the bootstrapped CIs
```{r}
# ordering and take the bottom 2.5% and top 97.5% lines
eta <- 0.5 * sims
lowerCI <- 0.025 * sims
dataCI <- 0.975 * sims
bb_se1 <- apply(df3, 2, function(X) X[order(X)]) # orders the predictions from lowest to highest
df1$boot.pred <- bb_se1[eta, ] # find the bottom 2.5%
df1$boot_lo <- bb_se1[lowerCI, ] # find the bottom 2.5%
df1$boot_up <- bb_se1[dataCI, ] # find the top 2.5%
# plotting
plot(data1$log.x, (data1$suc / data1$tot), main = "GAM") # second plot
lines(df1$log.x, df1$boot.pred, type = "l", lwd = 2, col = 2, xaxt = "n", las = 1) # plot model mean line
lines(df1$log.x, df1$boot_lo, type = "l", lwd = 2, xaxt = "n", las = 1) # plot data2 95% CI
lines(df1$log.x, df1$boot_up, type = "l", lwd = 2, xaxt = "n", las = 1) # plot lower 95% CI
```
**With that in mind, we can now bootstrap the ECx.**
Just to recap:
1\) we fit a GAMM to the data
2\) we extracted the residuals and resampled them
3\) made new predictions from the resampled residuals
4\) repeated this 200 times
Ok. Let resample just once to see how it look, but keeping everything in a function, which we will need below.
```{r}
set.seed(123)
lst1 <- list() # initial list
fit_gam2 <- function(resampled, ecx) {
m2.gamm <- gamm(cbind(suc, (tot - suc)) ~ s(log.x, fx = F, k = 3),
random = list(obs = ~1),
family = binomial,
data = resampled,
method = "REML",
verbosePQL = F
)
pred <- predict(m2.gamm, newdata = df1, type = "response", se.fit = T)
df2 <- data.frame(log.x = df1$log.x, pred = pred$fit)
ecx.interp2 <- function(ecx, df2) {
inhibx <- max(df2$pred) * ecx
nearest_idx <- which.min(abs(df2$pred - inhibx))
ecx <- df2$log.x[nearest_idx]
return(ecx)
}
out1 <- ecx.interp2(ecx, df2)
out <- unname(out1)
# lst[[1]] = dd
# return(lst)
return(out)
} # gam function returns prediction
fit_gam2(resampled_data, 0.5)
```
Now let's run it in a loop for each resample
```{r}
# Now run in loop for sims
ecx <- 0.5
sims <- 200
for (i in 1:sims) {
# resampled = data2[sample(nrow(data2), replace = TRUE), ]
# out = fit_gam2(resampled, ecx)
# lst[[i]] = out
resampled_resid <- resid_gam[sample(length(resid_gam), replace = TRUE)]
new_response <- m2.gamm$gam$fitted.values + resampled_resid
resampled_data <- data1
resampled_data$suc <- round(pmin(pmax(new_response * resampled_data$tot, 0), resampled_data$tot)) # pmax prevents values less than 0
out <- fit_gam2(resampled_data, ecx)
lst[[i]] <- out
}
# lst #returns all predictions in list
df3 <- unlist(lst)
# df3
hist(df3)
plot(density(df3)) # density plot
df4 <- df3[order(df3)]
df1$boot.pred <- df4[eta] # find the bottom 2.5%
df1$boot_lo <- df4[lowerCI] # find the bottom 2.5%
df1$boot_up <- df4[dataCI] # find the top 2.5%
# df1
```
# Plotting
```{r, message=FALSE, warning=FALSE, results='hide'}
library(ggplot2)
library(tidybayes)
df5 <- data.frame(ecx = df3)
p1 <- ggplot(df5, aes(x = ecx)) +
geom_density(aes(fill = "steelblue4"), alpha = 0.3) +
stat_pointinterval(aes(y = 0.00, x = ecx), .width = c(.66, .95))+
theme_light()
p1 <- p1 + scale_fill_manual(values = c("steelblue4")) +
scale_color_manual(values = c("steelblue4")) + theme(legend.position = "none") #
p1 <- p1 + scale_y_continuous(name = "Density")
p1 <- p1 + coord_cartesian(ylim = c(0.0, 3.5))
p1
```
And if you want everything wrapped into a function
```{r, eval=FALSE}
gamm_ecx <- function(data1, ecx) {
library(mgcv)
library(lattice)
library(MASS)
library(MuMIn)
data1$suc <- as.numeric(as.character(data1$suc))
data1$tot <- as.numeric(as.character(data1$tot))
data1$obs <- factor(formatC(1:nrow(data1), flag = "0", width = 3)) # unique tank ID for later on
lst1 <- list() # initial list
data1$log.x <- ifelse(data1$log.x < 0.1, 0, data1$log.x) # Replace neg with zero
set.seed(123)
m2.gamm <- gamm(cbind(suc, (tot - suc)) ~ s(log.x, fx = F, k = 3),
random = list(obs = ~1),
family = binomial,
data = data1,
method = "REML", verbosePQL = F
) #
resid_gam <- residuals(m2.gamm$gam, type = "response")
df1 <- data.frame(log.x = seq(min(data1$log.x), max(data1$log.x), length = 30)) # setting up new data frame (df) defining log.x values to run
fit_gam2 <- function(resampled, ecx) {
m2.gamm <- gamm(cbind(suc, (tot - suc)) ~ s(log.x, fx = F, k = 3),
random = list(obs = ~1),
family = binomial,
data = resampled,
method = "REML", verbosePQL = F
)
pred <- predict(m2.gamm, newdata = df1, type = "response", se.fit = T)
df2 <- data.frame(log.x = df1$log.x, pred = pred$fit)
ecx.interp2 <- function(ecx, df2) {
inhibx <- max(df2$pred) * ecx
nearest_idx <- which.min(abs(df2$pred - inhibx))
ecx <- df2$log.x[nearest_idx]
return(ecx)
}
out1 <- ecx.interp2(ecx, df2)
out <- unname(out1)
return(out)
} # gam function returns prediction
# fit_gam2(resampled_data, 0.5) #gives the same asnwer as above but in a function
sims <- 200
lst <- list() # initial list
for (i in 1:sims) {
resampled_resid <- resid_gam[sample(length(resid_gam), replace = TRUE)]
new_response <- m2.gamm$gam$fitted.values + resampled_resid
resampled_data <- data1
resampled_data$suc <- round(pmin(pmax(new_response * resampled_data$tot, 0), resampled_data$tot)) # pmax prevents values less than 0
out <- fit_gam2(resampled_data, ecx)
lst[[i]] <- out
}
lst # returns all predictions in list
df3 <- unlist(lst)
#df3
hist(df3)
plot(density(df3)) # density plot
df4 <- df3[order(df3)]
eta <- 0.5 * sims
lowerCI <- 0.025 * sims
dataCI <- 0.975 * sims
boot.pred <- df4[eta] # find the bottom 2.5%
boot_lo <- df4[lowerCI] # find the bottom 2.5%
boot_up <- df4[dataCI] # find the top 2.5%
df5 <- data.frame(boot.pred, boot_lo, boot_up)
#df5
output_list <- list(df4, df5)
return(output_list)
}
boot_ecx <- gamm_ecx(data1, 0.5)
boot_ecx[[1]]
boot_ecx[[2]]
```
If you fit multiple curves, you can then compare their ECx distributions together to assess for evidence of differences.
Happy bootstrapping!