-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRCode_Dataexample.Rmd
230 lines (184 loc) · 9.02 KB
/
RCode_Dataexample.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
---
title: "R-code for reanalysis of the COVID-19 study"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
This file provides the code for the reanalysis mentioned in the Discussion of Friedrich & Friede (2020). The data set is taken from Gautret et al. (2020), https://doi.org/10.1016/j.ijantimicag.2020.105949, Supplementary Table 1.
## Data preparation
Following Gautret et al. (2020), we imput missing outcomes on Day 6 by a last-
observation-carried-forward-approach, i. e. patients with missing PCR are considered positive on Day 6, if they were actually positive the day(s) before.
```{r}
library(Publish)
library(data.table)
library(survival)
library(ggplot2)
library(zoo)
library(survey)
library(lmtest)
library(sandwich)
library(parallel)
# Data preparation
covid <- readRDS("covid.rds")
# Impute missing outcomes on Day 6
covid[, 8:14] <- t(na.locf(t(covid[, 8:14])))
covid <- as.data.table(covid)
# drop unused variables
covid[, c("D0", "D1", "D2", "D3", "D4", "D5", "azi") := NULL]
## re-label outcome variable
covid[, D6 := ifelse(D6 == "NEG", 1, 0)]
setnames(covid, "D6", "outcome")
covid[, hydro := ifelse(hydro == "no", 0, 1)]
setnames(covid, "hydro", "trt")
```
After setting time since onset of symptoms to zero for asymptomatic patients, two patients with
missing time since onset remain. Both patients are classified as URTI and we use the median time since onset (3 days) for URTI-patients to impute these missing values.
```{r}
# timeonset is zero for asymptomatic patients
covid[clinicalstatus == "Asymptomatic", timeonset := 0]
covid[is.na(timeonset)]
# two missing values remain (both URTI), impute using median in the respective group
replace <- covid[, median(timeonset[clinicalstatus=="URTI"], na.rm =T)]
covid[is.na(timeonset), timeonset := replace]
```
## The PS-model
Our PS-model includes sex, age, clinical status and time since onset of symptoms as explanatory variables.
```{r}
# propensity score model
propmod <- glm(trt ~ sex + age + clinicalstatus + timeonset,
family="binomial", data = covid)
# propensity score
covid[, ps := predict(propmod, type = "response")]
```
### Compare propensity score overlap between the two groups
```{r}
data <- data.frame(treatment = covid$trt, score = covid$ps)
d1 <- density(data$score[data$treatment == 1], from = 0, to = 1, n = 200)
d0 <- density(data$score[data$treatment == 0], from = 0, to = 1, n = 200)
d <- data.frame(x = c(d1$x, d0$x), y = c(d1$y, d0$y), treatment = c(rep("Hydroxychloroquine", length(d1$x)),
rep("Control", length(d0$x))))
d$treatment <- factor(d$treatment, levels = c("Hydroxychloroquine", "Control"))
ggplot(d, aes(x = .data$x, y = .data$y)) +
geom_density(stat = "identity", aes(color = .data$treatment, group = .data$treatment, fill = .data$treatment)) +
scale_fill_manual(values = c(rgb(0.8, 0, 0, alpha = 0.5),
rgb(0, 0, 0.8, alpha = 0.5))) +
scale_color_manual(values = c(rgb(0.8, 0, 0, alpha = 0.5),
rgb(0, 0, 0.8, alpha = 0.5))) +
scale_x_continuous("Propensity score", limits = c(0, 1)) +
scale_y_continuous("Density", limits = c(0, max(d$y))) +
theme(axis.text = element_text(size=20), axis.title = element_text(size = 20))+
theme(legend.title = element_blank(),
legend.position = "top",
legend.text = element_text(margin = margin(0, 0.5, 0, 0.1, "cm"), size = 20))+
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))
```
### IPTW analysis
```{r}
# calculate IPTW
covid[trt == 1, w:= 1/ps]
covid[trt == 0, w:= 1/(1-ps)]
## risk difference
# survey package:
msm <- svyglm(outcome ~ trt, design = svydesign(~1, weights = ~w, data = covid))
est.msm <- msm$coefficients["trt"]
stderror.msm <- coeftest(msm, vcov = vcovHC(msm, type = "HC3"))[2,2]
ci.iptw <- c(est.msm - 1.96*stderror.msm, est.msm, est.msm + 1.96*stderror.msm)
names(ci.iptw) <- c("Lower", "Estimate", "Upper")
```
### Simple DR g-computation
The Q-model for the g-computation additionally includes treatment status and the covariate z, which is equal to the IPT weight for the treatment group and equal to the negative IPT weight in the control group.
```{r}
# simple doubly robust model
covid[trt == 0, z:= -w]
covid[trt == 1, z:= w]
## calculate confidence intervals using bootstrap
source("bootstrap_RD.R")
boots <- bootstrap(covid, form = outcome ~ trt + z + age + sex + clinicalstatus + timeonset)
CIboot <- causal.rd(boots$risks, boots$boot)
```
```{r}
# the same with dummy categories for logit(ps)-quintiles
qt <- quantile(log(covid$ps/(1-covid$ps)), probs = seq(0, 1, 1/5))
covid[, logitps := log(ps/(1-ps))]
# create dummy variables
covid[, z1:= as.numeric(logitps < qt[2])]
covid[, z2:= as.numeric(qt[2] < logitps & logitps < qt[3])]
covid[, z3:= as.numeric(qt[3] < logitps & logitps < qt[4])]
covid[, z4:= as.numeric(qt[4] < logitps & logitps < qt[5])]
## calculate confidence intervals using bootstrap
boots.q <- bootstrap(covid, form = outcome ~ trt + z1 + z2 + z3 + z4 + age + sex + clinicalstatus + timeonset)
CIboot.q <- causal.rd(boots.q$risks, boots.q$boot)
```
### Matching
The matching procedure results in a data set with 11 controls and 11 treated
patients, thus discarding 5 controls and 9 treated patients from the analysis.
```{r}
## Matching
library(MatchIt)
ps.mod <- matchit(trt ~ sex + age + clinicalstatus + timeonset, data = covid,
method = "nearest", replace = FALSE, caliper = 0.2, distance = "linear.logit")
m.data <- match.data(ps.mod, distance = "pscore")
m.data <- as.data.table(m.data)
# b: only treated experience the event
b <- m.data[trt ==1 & outcome == 1, .N]
# c: only untreated experience the event
c <- m.data[trt ==0 & outcome == 1, .N]
# n: number of matched pairs
n <- (NROW(m.data)/2)
diff.match <- (b-c)/n
var <- ((b+c)-(c-b)^2/n)/n
ci.match <- c(diff.match - 1.96*var, diff.match, diff.match + 1.96*var)
names(ci.match) <- c("Lower", "Estimate", "Upper")
```
### AIPW
```{r}
library(PSW)
source("psw-code_dataex.R")
form.ps <- "trt ~ sex + age + timeonset + clinicalstatus"
form.out <- "outcome ~ sex + age + timeonset + clinicalstatus"
tmp <- ps.aug(data = covid, form.ps = form.ps, weight = "ATE",form.outcome = form.out, family="binomial")
ci.aipw <- c(tmp$est.risk - 1.96*tmp$std.risk, tmp$est.risk, tmp$est.risk + 1.96*tmp$std.risk)
names(ci.aipw) <- c("Lower", "Estimate", "Upper")
```
### Fitting the models and putting all results in a table
```{r}
# 1.) crude risk difference
rd <- lm(outcome ~ trt, data = covid)
est <- rd$coefficients["trt"]
stderror <- coeftest(rd, vcov = vcovHC(rd, type = "HC3"))[2,2]
ci.unadjusted <- c(est - 1.96*stderror, est, est + 1.96*stderror)
names(ci.unadjusted) <- c("Lower", "Estimate", "Upper")
# 2.) covariate adjustment
lm.adj <- lm(outcome ~ trt + sex + age + clinicalstatus + timeonset, data = covid)
est.a <- lm.adj$coefficients["trt"]
stderror.a <- coeftest(lm.adj, vcov = vcovHC(lm.adj, type = "HC3"))[2,2]
ci.adj <- c(est.a - 1.96*stderror.a, est.a, est.a + 1.96*stderror.a)
names(ci.adj) <- c("Lower", "Estimate", "Upper")
## 3.) PS covariate
lm.PScov <- lm(outcome ~ trt + ps, data = covid)
est.ps <- lm.PScov$coefficients["trt"]
stderror.ps <- coeftest(lm.PScov, vcov = vcovHC(lm.PScov, type = "HC3"))[2,2]
ci.PScov <- c(est.ps - 1.96*stderror.ps, est.ps, est.ps + 1.96*stderror.ps)
names(ci.PScov) <- c("Lower", "Estimate", "Upper")
table <- as.data.frame(rbind(ci.unadjusted, ci.adj, ci.PScov, ci.iptw, ci.match, CIboot, CIboot.q, ci.aipw))
table$method <- c("Crude risk difference", "Covariate adjusted RD", "PS covariate", "IPTW", "Matching", "Simple DR g-computation", "DR using quintile", "AIPW")
knitr::kable(table)
```
```{r}
theme_set(theme_bw())
ggplot(table,aes(Estimate,method)) +
geom_point(size=5, shape=18) +
geom_errorbarh(aes(xmax = Upper, xmin = Lower), height = 0.15, lwd=1) +
geom_vline(xintercept = 0, linetype = "longdash") +
#scale_x_log10()+
theme(axis.text = element_text(size=14), axis.title = element_text(size = 14))+
labs(x="Risk difference", y="Method")
```
As we can see in the figure, all methods yield similar rather large point estimates.
However, the methods differ with respect to statistical significance: The very wide confidence
interval obtained by the simple DR g-computation approach includes 0. As noted in the simulation study in Friedrich & Friede (2020), the simple DR g-computation should not be trusted due to the very wide and often uninformative confidence intervals.
### References
* Friedrich, S and Friede, T (2020). Causal inference methods for small non-randomized studies: Methods and an application in COVID-19.
* Gautret, P., Lagier, J.-C., Parola, P., Meddeb, L., Mailhe, M., Doudier, B., Courjon, J., Giordanengo, V., Vieira, V. E., Dupont, H. T., et al. (2020). Hydroxychloroquine and azithromycin as a treatment of covid-19: results of an open-label non-randomized clinical trial. International journal of antimicrobial agents, page 105949.