forked from rcvanwijk/ReproducibilityPMX
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Reproducibility_PMX_BCG-CoV-19_Statistical analysis framework_v1.Rmd
8487 lines (6841 loc) · 462 KB
/
Reproducibility_PMX_BCG-CoV-19_Statistical analysis framework_v1.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
params:
  sub_title:
    input: text
    value: 'blank_placeholder'
title: "Unblinded data report to Data Safety Monitoring Board"
subtitle: "`r params$sub_title`"
author:
- "Dr. Rob van Wijk, Laurynas Mockeliunas, MSc., Prof. Dr. Ulrika Simonsson \n *Department of Pharmaceutical Biosciences, Uppsala University, Sweden*"
date: "`r format(Sys.time(), '%A, %d %B %Y')`"
output:
  pdf_document:
    extra_dependencies: ["float"]
    includes:
      in_header: header_unblinded.tex
    toc: yes
knit: (
  function(inputFile, encoding) {
    pSubTitle <- "BCG re-vaccination for healthcare workers in SARS-CoV-2 pandemic"
    rmarkdown::render(
      input = inputFile,
      encoding = encoding,
      params = list(sub_title = pSubTitle),
      output_file = paste(format(Sys.time(), "%Y%m%d_Unblinded_datareporttoDSMB_"), pSubTitle, sep = '')) })
editor_options:
  chunk_output_type: console
---
```{r setup, include=FALSE}
# Clean slate: remove every object, hidden (dot-prefixed) ones included, from the
# current environment before the report is built.
# NOTE(review): when this chunk is run interactively it wipes the user's workspace.
rm(list = ls(all.names = TRUE))
# start the clock to report how long compilation took
ptm <- proc.time()
# default chunk options for the whole report
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE,
  fig.height = 3.5,
  comment = '>',
  tidy.opts = list(width.cutoff = 70),
  tidy = TRUE
)
# knitr.kable.NA = '' gives empty table cells for NAs
# NOTE(review): fig.pos and out.extra are normally knitr chunk options; confirm that
# setting them through options() has the intended effect on figure placement.
options("scipen" = 100, "digits" = 4, knitr.kable.NA = '', fig.pos = "!H", out.extra = "")
###
# Specify report type
# and arm label
#
# CLOSED variable is used throughout report through IF statements and labelling to recreate the same figures and tables
# for reports to the closed session of the trial meetings (trial data reported per arm, blinded during the trial [arm1, arm2],
# and unblinded after data lock), or the open session of the trial meetings (trial data only reported at summary level).
###
CLOSED <- TRUE # TRUE = closed report, FALSE = open report --> when switching, also change in the YAML header:
# 1. the title
# 2. the header .tex file (in_header)
# 3. the file name of the report (output_file)
Arm1 <- 'placebo'
Arm2 <- 'BCG'
#load libraries (order kept as is: packages attached later mask same-named functions of earlier ones)
library(tidyverse)
library(viridis)
library(knitr)
library(kableExtra)
library(survival)
library(survminer)
library(gridExtra)
library(data.table)
library(ggtext)
#specify plotting theme
theme_set(theme_bw())
theme_update(panel.grid = element_blank())
#define master directory based on who/which machine is running the script (downstream folder structure is the same)
# A scalar if/else is used instead of ifelse() so that WD is a plain, unnamed string.
WD <- if (Sys.info()[['nodename']] == 'RCVW') {
  'X:/1.Postdoc'
} else {
  '//user.uu.se/bmci/FBV-Users/robva847/Documents/1.Postdoc'
}
#define directory of input master datasets (relative to WD)
wd1 <- '/3.Projects/4.BCG/1.Input/Screening_and_Enrolment_(V40_18_Jun_2020)_20211025-105845/'
wd2 <- '/3.Projects/4.BCG/1.Input/Events_(V10_01_Jun_2020)_20211025-110415/'
wd3 <- '/3.Projects/4.BCG/1.Input/Lab_results_(V10_01_Jul_2020)_20211025-110001/'
wd4 <- '/3.Projects/4.BCG/1.Input/Follow_up_(V40_22_Feb_2021)_20211025-110214/'
#define what to evaluate in the knitting of the report (variables called per chunk)
eval_dsmb <- TRUE
echo_dsmb <- FALSE
# Build a matrix of descriptive statistics with one row per statistic and one
# column per element of `variable`.
#
# variable: a data frame (or list) of numeric vectors; quantile(), mean() and sd()
#           are applied to each element.
# returns:  a matrix with the rows, in this order: total number, number of null,
#           number of NA, minimum, maximum, 1st quantile, 3rd quantile, median,
#           mean, standard deviation. All statistics except the total number
#           leave missing values out.
summary_table <- function(variable){
  # number of records, missing values included
  n_total <- function(x){length(x)}
  # number of records equal to zero ("null"), ignoring missing values
  n_zero <- function(x){sum(x == 0, na.rm = TRUE)}
  # number of missing records
  n_missing <- function(x){sum(is.na(x))}
  # the p-quantile of every element of `variable`
  quantile_of <- function(p){sapply(variable, quantile, probs = p, na.rm = TRUE)}
  rbind(
    `total number` = sapply(variable, n_total),
    `number of null` = sapply(variable, n_zero),
    `number of NA` = sapply(variable, n_missing),
    minimum = quantile_of(0),
    maximum = quantile_of(1),
    `1st quantile` = quantile_of(0.25),
    `3rd quantile` = quantile_of(0.75),
    median = quantile_of(0.5),
    mean = sapply(variable, mean, na.rm = TRUE),
    `standard deviation` = sapply(variable, sd, na.rm = TRUE)
  )
}
```
```{r enrolment input data, eval = eval_dsmb, echo = echo_dsmb}
#####################################################
# Input from Screening and Enrolment master dataset #
#####################################################
# Read the comma-separated export (first row = column names) from the
# screening/enrolment folder below the master directory.
df1_full <- read.table(
  file = paste0(WD, wd1, 'table.dat'),
  fileEncoding = 'UTF-8-BOM',
  header = TRUE,
  sep = ','
)
```
```{r event input data, eval = eval_dsmb, echo = echo_dsmb}
####################################
# Input from Events master dataset #
####################################
# Read the comma-separated export (first row = column names) from the
# events folder below the master directory.
df2_full <- read.table(
  file = paste0(WD, wd2, 'table.dat'),
  fileEncoding = 'UTF-8-BOM',
  header = TRUE,
  sep = ','
)
```
```{r lab input data, eval = eval_dsmb, echo = echo_dsmb}
##################################
# Input from Labs master dataset #
##################################
# Read the comma-separated export (first row = column names) from the
# lab results folder below the master directory.
df3_full <- read.table(
  file = paste0(WD, wd3, 'table.dat'),
  fileEncoding = 'UTF-8-BOM',
  header = TRUE,
  sep = ','
)
```
```{r follow-up input data, eval = eval_dsmb, echo = echo_dsmb}
#######################################
# Input from Follow-up master dataset #
#######################################
# Read the comma-separated export (first row = column names) from the
# follow-up folder below the master directory.
df4_full <- read.table(
  file = paste0(WD, wd4, 'table.dat'),
  fileEncoding = 'UTF-8-BOM',
  header = TRUE,
  sep = ','
)
```
```{r combine all datasets, eval = eval_dsmb, echo = echo_dsmb}
# Combine all master datasets by PID so that every record (whose Submission.ID values
# can vary) is assigned to its participant, identified by the unique anonymized PID.
#PID is personal identifier, coded BCGxxxx (see Data Definition Table)
df_full <- df1_full %>%
  full_join(df2_full, by = 'PID') %>%
  full_join(df3_full, by = 'PID') %>%
  full_join(df4_full, by = 'PID') %>%
  # Combine the event fields of the three event types (respiratory tract infection,
  # injection site reaction, other) into single columns: the first non-missing value
  # of the three source columns is used. coalesce() returns character in every case,
  # also when all three sources are NA.
  # NOTE(review): an empty string is not NA, so an empty first field takes precedence
  # over a filled second or third one -- confirm how empty fields are exported.
  mutate(
    event_start_all = coalesce(
      as.character(event_start),
      as.character(event_start_1),
      as.character(event_start_2)
    ),
    event_stop_all = coalesce(
      as.character(event_stop),
      as.character(event_stop_1),
      as.character(event_stop_2)
    ),
    event_name_all = coalesce(
      as.character(event_name),
      as.character(event_name_1),
      as.character(event_name_2)
    )
  )
#assign group name for open report (summary level data reporting)
if (!CLOSED) {
  df_full$group <- 'Both'
}
# Check for column names that occur in more than one master dataset: after joining they
# are uninformatively suffixed (.x.x, .y.y), so no variable of interest should be listed
# here. PID is the exception, as it is the join key.
n1 <- names(df1_full)
n2 <- names(df2_full)
n3 <- names(df3_full)
n4 <- names(df4_full)
local({
  all_names <- c(n1, n2, n3, n4)
  # keep every occurrence of a name that appears at least twice, in alphabetical order
  repeated <- all_names[duplicated(all_names) | duplicated(all_names, fromLast = TRUE)]
  repeated[order(repeated)]
})
#group assignment: one row per PID/group combination
PID_group <- distinct(df_full, PID, group)
#Visual check of the number of records in each of the four master datasets
data.frame(
  Records = c(nrow(df1_full), nrow(df2_full), nrow(df3_full), nrow(df4_full)),
  Dataset = c('Screening/\nenrolment', 'Events', 'Lab results', 'Follow-up')
) %>%
  # fix the bar order to the order in which the datasets are listed above
  mutate(Dataset = factor(Dataset, levels = Dataset)) %>%
  ggplot(aes(Dataset, Records)) +
  geom_col(aes(fill = Dataset)) +
  # geom_text(aes(y = Records - 500, label = Records), col = 'white', size = 12) +
  scale_x_discrete(name = 'Master database') +
  # guide = 'none' hides the legend (guide = FALSE is deprecated in ggplot2)
  scale_fill_viridis(option = 'inferno', end = 0.75, discrete = TRUE, guide = 'none')
#Visual check of the event numbers categorized by MedDRA lower level term (LLT):
#horizontal bars for the 10 most frequent terms (ties on the 10th place are all shown)
df_full %>%
  # one row per coded event
  distinct(PID, event_number, SOC, HLGT, HLT, LLT, PT) %>%
  # leave out events without a system organ class
  filter(!is.na(SOC), SOC != '') %>%
  count(LLT) %>%
  arrange(desc(n)) %>%
  top_n(10, n) %>%
  # ascending order so that, after flipping, the most frequent term ends up on top
  arrange(n) %>%
  mutate(LLT = factor(LLT, levels = LLT)) %>%
  data.frame() %>%
  ggplot(aes(LLT, n)) +
  geom_col() +
  coord_flip()
# Get data cut off date from the folder name of the screening/enrolment dataset
# (format: "\\3.Projects\\4.BCG\\1.Input\\Screening_and_Enrolment_(V40_18_Jun_2020)_20210223-110510\\")
date_cut_off <- local({
  # the piece after the last underscore holds date and time
  name_parts <- unlist(strsplit(wd1, split = '_'))
  date_time <- tail(name_parts, 1)
  # the piece before the first dash is the date
  date_only <- head(unlist(strsplit(date_time, split = '-')), 1)
  as.Date(date_only, format = '%Y%m%d')
})
```
```{r manual input report, eval = eval_dsmb, echo = echo_dsmb}
################
# MANUAL INPUT #
################
# The values below are not part of the input datasets and have to be typed in by hand.
# All dates are stored as character strings in "yyyy-mm-dd" format.
# Date of the DSMB meeting
# NOTE(review): this object has the same name as the base R function date().
date <- "2021-11-09" #format: "yyyy-mm-dd"
# Date of the last (previous) DSMB meeting
date_last_dsmb <- "2021-07-06" #format: "yyyy-mm-dd"
# DSMB specific information
n_dsmb <- 6 #number of meetings
# Date of the last QC review of the data analysis, and who performed it
# NOTE(review): this date lies before date_last_dsmb -- confirm it is still current.
date_last_review_data_analysis <- '2020-06-29' #format: "yyyy-mm-dd"
reviewer_data_analysis <- 'Dr. Joakim Nyberg'
# Who prepared the report
author_prep <- "Rob van Wijk, Laurynas Mockeliunas, Ulrika Simonsson"
```
```{r check automatic input report, eval = eval_dsmb, echo = echo_dsmb, message = F, warning = F, results = 'hide', include = F}
###############################
# DATA CHECK AUTOMATIC INPUT #
# Run and check results every #
# data transfer and query #
# results to data managers #
###############################
#check final date complete: list all final_date records of every participant that has
#at least one record without a final date
df_full %>%
  filter(PID %in% PID[is.na(final_date)]) %>%
  distinct(PID, final_date) %>%
  arrange(PID)
#check for event stop date after event start date (events without a stop date are skipped)
df_full %>%
  filter(event_stop != '', event_start > event_stop) %>%
  distinct(PID, event_number, event_start, event_stop, LLT) %>%
  arrange(PID, event_number)
# check for (ongoing) events after the final date
# NOTE(review): hs_time_event0_original and hs_time_event0_fu are not created before this
# point in the part of the script shown here; they must already exist when this runs.
hs_ongoing_afterfinaldate <- rbind(hs_time_event0_original, hs_time_event0_fu) %>%
  distinct(PID, group, event_number, event_start_all, date_vaccination, final_date, onset_week, Time0, Time, HS, event_ongoing, event_status, event_stop) %>% #remove overlapping records from original and follow-up
  # keep health-score records timed later than the final date (both in weeks since vaccination);
  # `units` is spelled out in full instead of relying on partial matching of `unit`
  filter(!is.na(HS) & Time > as.numeric(as.Date(final_date) - as.Date(date_vaccination), units = 'weeks')) %>%
  mutate(stoptime = as.numeric(as.Date(event_stop) - as.Date(date_vaccination), units = 'weeks'))
#check for negative time (one ID has a pregnancy start prior to enrolment which can be ignored)
# NOTE(review): event_start_all is a character column, so this comparison is presumably
# textual and relies on dates being stored as yyyy-mm-dd -- confirm.
df_full %>%
  filter(event_start_all < date_vaccination) %>%
  distinct(PID, event_start_all, date_vaccination)
#check for 'other vaccines' to make sure there is no COVID-19 related vaccine that we won't pick up
# NOTE(review): df4_bcg_vaccine is likewise not created before this point in the part shown here.
df4_bcg_vaccine %>% distinct(bcg_none_other_comment)
#99.9 will be default for missing numerical decimal fields
#999 will be default for missing integer numerical fields
# NOTE(review): the object names below look swapped relative to the two lines above
# (..._dec collects the 999 cells, ..._int the 99.9 cells) -- confirm against how they
# are used further down before renaming anything.
# Both tables keep PID plus every column in which the code occurs, for the rows in which
# it occurs.
df_missing_data_dec <- local({
  # (row, column) positions of all cells equal to 999
  hits <- which(df_full == 999, arr.ind = TRUE)
  df_full[hits[, 1], ] %>%
    select(PID, names(df_full)[unique(hits[, 2])]) %>%
    filter(!is.na(PID)) %>%
    distinct(.keep_all = TRUE) %>% #prevent double records because of follow-up or event records in which the demographics are duplicated
    select(!c('Duration..seconds..x.x', 'Duration..seconds..y', 'req_number')) %>%
    head(12) %>%
    distinct()
})
df_missing_data_int <- local({
  # (row, column) positions of all cells equal to 99.9
  hits <- which(df_full == 99.9, arr.ind = TRUE)
  df_full[hits[, 1], ] %>%
    select(PID, names(df_full)[unique(hits[, 2])]) %>%
    filter(!is.na(PID)) %>%
    distinct(.keep_all = TRUE) #prevent double records because of follow-up or event records in which the demographics are duplicated
})
## check for different spellings: all distinct event names containing 'cov' (any case)
spelling0 <- df_full %>%
  filter(str_detect(event_name, regex('cov', ignore_case = TRUE))) %>%
  distinct(event_name)
spelling0
## and remove 'COVID-19 negative' instances (names containing 'neg')
spelling <- spelling0 %>%
  filter(!str_detect(event_name, regex('neg', ignore_case = TRUE)))
## split the remaining names into those without and with 'post' in the name
spelling_nopost <- spelling %>%
  filter(!str_detect(event_name, regex('post', ignore_case = TRUE)))
spelling_post <- spelling %>%
  filter(str_detect(event_name, regex('post', ignore_case = TRUE)))
## check crosslink between c19_positive (follow-up question, positive C19 test) and event recorded (RTI event for COVID-19)
# participants that reported a positive test AND have a COVID-19 coded event
PID_COVID <- df_full %>%
  # filter(event_name %in% spelling$event_name | LLT == 'COVID-19' | c19_confirm == 'Yes' | c19_positive == 'Yes') %>% #select C19 status from events or from follow-up question
  filter(
    c19_positive == 'Yes', #positive test reported in the follow-up question
    LLT == 'COVID-19' | LLT_code == 10084268 | PT_code == 10084268 | PT_code == 10084380 #use both LLT and PT_code for COVID-19 (pneumonia)
  ) %>%
  distinct(PID)
# records of participants that reported a positive test but have no COVID-19 coded event
df_full %>%
  # filter(event_name %in% spelling$event_name | LLT == 'COVID-19' | c19_confirm == 'Yes' | c19_positive == 'Yes') %>% #select C19 status from events or from follow-up question
  filter(
    c19_positive == 'Yes',
    !(PID %in% PID_COVID$PID) #filter the IDs without a COVID-19 event
  ) %>%
  distinct(PID, event_name, LLT, c19_positive, event_HS) %>%
  arrange(PID)
## check crosslink between post-corona syndrome and corona
post <- df_full %>%
  #use both LLT and PT_code for COVID-19 (pneumonia) as well as asymptomatic (asymptomatic COVID-19 leading to long COVID reported)
  filter(LLT == 'COVID-19' | LLT_code == 10084268 | PT_code == 10084268 | PT_code == 10084380 | PT_code == 10084459) %>%
  #and find the event number (post viral has the same event number)
  distinct(PID, event_number, event_name, LLT, date_vaccination) %>%
  #join with post viral syndrome
  inner_join(
    df_full %>%
      # spelling_post is a one-column data frame: match against its event_name column.
      # Matching against the data frame itself does not compare the individual names.
      filter(event_name %in% spelling_post$event_name | LLT == 'Post viral fatigue' | LLT == 'Post viral fatigue syndrome' | PT_code == 10057244) %>%
      select(PID, group, event_number, event_name, LLT, event_start_all, event_stop_all),
    by = c('PID', 'event_number')
  ) %>%
  #distinct events (x = original event, y = post viral syndrome)
  distinct(PID, group, event_number, LLT.x, LLT.y, .keep_all = TRUE)
#participants with post covid syndrome record: overall and per arm
part_post <- post %>% distinct(PID)
part_post_group1 <- post %>% filter(group == Arm1) %>% distinct(PID)
part_post_group2 <- post %>% filter(group == Arm2) %>% distinct(PID)
#check covid-19 LLT term --> should only be 'COVID-19' (10084268) or 'Asymptomatic COVID-19' (10084459) which we don't look at
df_full %>%
  filter(
    HLT %in% c('Coronavirus infections', 'Viral lower respiratory tract infections') |
      HLT_code %in% c(10047468, 10084510)
  ) %>%
  distinct(LLT, LLT_code, PT, PT_code) #, HLT, HLT_code, HLGT, HLGT_code, SOC, SOC_code
#double check asymptomatic COVID-19 is with HS = 0
df_full %>%
  filter(LLT == 'Asymptomatic COVID-19' | LLT_code == 10084459) %>%
  distinct(PID, event_number, event_name, sars_cov_2, event_HS, LLT)
#check for c19 with HS0: COVID-19 events for which either health score field is 0
df_full %>%
  filter(
    LLT == 'COVID-19' | LLT_code == 10084268 | PT_code == 10084268 | PT_code == 10084380, #use both LLT and PT_code for COVID-19 (pneumonia)
    event_hs == 0 | event_HS == 0
  ) %>%
  distinct(PID, event_number, event_name, LLT, event_hs, event_HS)
## health status definitions:
# event_hs / event_hs_1 / event_hs_2 record the highest health score (HS) for an event (unique event is identified by event_number, which should not overlap --> discrepancies will be discussed)
# for respiratory tract infections (RTIs), event_hs_fu also records the highest HS for the follow-up
# for injection site reactions (ISRs) and other events, event_hs_fu_1 also records the highest HS for the follow-up
# the highest of these two constitutes the highest HS per event and will be reported to the DSMB --> recorded in event_HS variable
## health status over time definitions:
# HS over time is reported for RTI only in health_status (original event form, Onset:Week_5, filled in once) and follow-up (event_rti_fu, follow-up RTI form, week_1:week_12, filled in with a weekly frequency)
# The tables are complementary, e.g. Week_1 in health_status should be the same as week_1 in event_rti_fu.
# The tables will be merged for reporting HS over time
# participants with at least one follow-up week (week_1 to week_12) filled in
PID_fu <- df2_full %>%
  filter(
    week_1 != '' | week_2 != '' | week_3 != '' | week_4 != '' |
      week_5 != '' | week_6 != '' | week_7 != '' | week_8 != '' |
      week_9 != '' | week_10 != '' | week_11 != '' | week_12 != ''
  ) %>%
  distinct(PID)
#check for discrepancies between the two HS over time tables: records in which Week_x (original
#form) and week_x (follow-up form) differ for any of weeks 1-5; pairs with a missing value do
#not count as a discrepancy. The check is done twice: on the difference and on the values.
# NOTE(review): as.numeric() on a factor column returns level codes, not the recorded
# score -- confirm the week columns are not read in as factors.
df2_full %>%
  filter(PID %in% PID_fu$PID) %>%
  select(PID, event_number, Week_1:Week_5, week_1:week_10) %>%
  filter(
    as.numeric(Week_1) - as.numeric(week_1) != 0 |
      as.numeric(Week_2) - as.numeric(week_2) != 0 |
      as.numeric(Week_3) - as.numeric(week_3) != 0 |
      as.numeric(Week_4) - as.numeric(week_4) != 0 |
      as.numeric(Week_5) - as.numeric(week_5) != 0
  )
df2_full %>%
  filter(PID %in% PID_fu$PID) %>%
  select(PID, event_number, Week_1:Week_5, week_1:week_10) %>%
  filter(
    as.numeric(Week_1) != as.numeric(week_1) |
      as.numeric(Week_2) != as.numeric(week_2) |
      as.numeric(Week_3) != as.numeric(week_3) |
      as.numeric(Week_4) != as.numeric(week_4) |
      as.numeric(Week_5) != as.numeric(week_5)
  )
#check for hospitalization events (HS > 3), highest health score first
df_full %>%
  filter(event_HS > 3) %>%
  distinct(PID, event_HS, LLT, event_name_all) %>%
  arrange(desc(event_HS))
#check for empty MedDRA LLT records -> email to complete
#(the list is written to a csv file in the current working directory)
df_full %>%
  filter(LLT == '') %>%
  distinct(PID, event_number, LLT, PT, LLT_code, PT_code) %>%
  arrange(PID) %>%
  write.csv('PID_eventno_without_LLT.csv')
#check for inconclusive test results: records with a comment on the SARS-CoV-2 or IGRA result.
#Each comment field is checked twice, once for being non-empty and once for being non-missing.
df_full %>%
  filter(sars_cov_2.nr_comment != '') %>%
  select(PID, sars_cov_2, sars_cov_2.nr_comment)
df_full %>%
  filter(!is.na(sars_cov_2.nr_comment)) %>%
  select(PID, sars_cov_2, sars_cov_2.nr_comment)
df_full %>%
  filter(igra.nr_comment != '') %>%
  select(PID, igra, igra.nr_comment)
df_full %>%
  filter(!is.na(igra.nr_comment)) %>%
  select(PID, igra, igra.nr_comment)
# Visual checks of variables that are not explicitly visualized as risk factors in the
# report below. Every variable gets its own plot, facetted and coloured by trial arm.
# NOTE(review): the facet labeller only lists 'Both', '1' and '2'; confirm these match the
# values of `group` (Arm1/Arm2 are set to other labels in the setup chunk).

# Bar chart of the counts per category of one column of df_full, per arm.
# variable:   name of the column, as a string
# drop_empty: when TRUE, records with an empty string (or NA) for the column are left out
.check_bar_plot <- function(variable, drop_empty = TRUE) {
  plot_data <- df_full
  if (drop_empty) {
    plot_data <- plot_data %>% filter(.data[[variable]] != "")
  }
  plot_data %>%
    ggplot(aes(.data[[variable]])) +
    geom_bar(aes(fill = factor(group)), alpha = 0.75) +
    # guide = 'none' hides the legend (guide = FALSE is deprecated in ggplot2)
    scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = 'none', option = 'cividis') +
    theme_bw() +
    facet_grid(~group, labeller = labeller(group = c('Both' = 'Total trial population', '1' = '1', '2' = '2')))
}
#check vital signs: density-scaled histogram with a density curve on top
for (i in c("BP_systolic", "BP_diastolic", "heart_rate", "resp_rate", "temp", "alcohol_day", "cannabis_week")) {
  print(df_full %>%
    ggplot(aes(.data[[i]])) +
    # after_stat(density) replaces the deprecated ..density.. notation
    geom_histogram(aes(y = after_stat(density), fill = factor(group)), alpha = 0.5) +
    geom_density() +
    scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = 'none', option = 'cividis') +
    theme_bw() +
    facet_grid(~group, labeller = labeller(group = c('Both' = 'Total trial population', '1' = '1', '2' = '2'))))
}
#check vital sign classifications (all records are plotted, empty ones included)
for (i in c("BP_normal", "hr_normal", "rr_normal", "temp_normal")) {
  print(.check_bar_plot(i, drop_empty = FALSE))
}
#check other variables
for (i in c("bubble", "patients_seen", "expect_interact", "post_meno", "contracept", "hyster", "bearing_potential", "doctor_4wks", "contact_covid19", "tested_covid19", "contact_tb", "HIVrapid_result", "fluvac", "other_vac", "chemo", "anticyto", "steroids", "covid_meds")) {
  print(.check_bar_plot(i))
}
#check medical history
for (i in c("medhis_tb", "medhis_cerebro", "medhis_cancer", "medhis_transplant", "medhis_immuno", "medhis_blood", "medhis_rti", "medhis_allergy", "medhis_hey_fever", "medhis_sinus", "medhis_other", "medhis_other1", "medhis_other2")) {
  print(.check_bar_plot(i))
}
#check baseline symptoms
for (i in c("sym_fever_1", "sym_cough_1", "sym_cough_prod", "sym_cold_1", "sym_breath_1", "sym_fatigue_1", "sym_throat_1", "sym_headache_1", "sym_pain_1", "sym_any")) {
  print(.check_bar_plot(i))
}
#check for covid medication or steroids or other exclusion criteria:
#each check lists the participants with the finding next to the matching exclusion criterion field
df_full %>%
  filter(HIVrapid_result == 'Positive' | self_report_HIV == 'HIV_positive') %>%
  distinct(PID, HIVrapid_result, self_report_HIV, excl3)
df_full %>%
  filter(covid_meds == 'Yes') %>%
  distinct(PID, covid_meds, excl8)
df_full %>%
  filter(steroids == 'Yes') %>%
  distinct(PID, steroids, excl7)
df_full %>%
  filter(pregnant_breastfeeding == 'Yes') %>%
  distinct(PID, pregnant_breastfeeding, excl6)
#frequency of the (lower-cased) notes for records with covid medication or steroids;
#across() replaces the deprecated mutate_each()
df_full %>%
  filter(covid_meds == 'Yes') %>%
  select(covid_meds, notes) %>%
  mutate(across(everything(), ~ factor(tolower(.x)))) %>%
  summary()
df_full %>%
  filter(steroids == 'Yes') %>%
  select(steroids, notes) %>%
  mutate(across(everything(), ~ factor(tolower(.x)))) %>%
  summary()
```
```{r automatic input enrolment and demographics, eval = eval_dsmb, echo = echo_dsmb, message = F}
##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Dataset: enrolment/demographics #
##########################################
#input enrolment of participants
part_enrol <- length(unique(df_full$PID)) #number of participants enrolled assumed to be equal to the number of unique PIDs
# Per-arm counts use filter(), which drops records without a group. Subsetting PID with
# `group == Arm` would turn every record with a missing group into an NA entry and count
# that NA as one extra participant.
part_group1 <- df_full %>% filter(group == Arm1) %>% distinct(PID) %>% nrow()
part_group2 <- df_full %>% filter(group == Arm2) %>% distinct(PID) %>% nrow()
#per site: number of distinct participants, overall and per arm
part_enrol_central <- df_full %>% filter(site == 'Central') %>% distinct(PID) %>% nrow()
part_enrol_eden <- df_full %>% filter(site == 'Eden') %>% distinct(PID) %>% nrow()
part_enrol_uct <- df_full %>% filter(site == 'UCT') %>% distinct(PID) %>% nrow()
part_enrol_central_group1 <- df_full %>% filter(site == 'Central', group == Arm1) %>% distinct(PID) %>% nrow()
part_enrol_eden_group1 <- df_full %>% filter(site == 'Eden', group == Arm1) %>% distinct(PID) %>% nrow()
part_enrol_uct_group1 <- df_full %>% filter(site == 'UCT', group == Arm1) %>% distinct(PID) %>% nrow()
part_enrol_central_group2 <- df_full %>% filter(site == 'Central', group == Arm2) %>% distinct(PID) %>% nrow()
part_enrol_eden_group2 <- df_full %>% filter(site == 'Eden', group == Arm2) %>% distinct(PID) %>% nrow()
part_enrol_uct_group2 <- df_full %>% filter(site == 'UCT', group == Arm2) %>% distinct(PID) %>% nrow()
# NOTE(review): visit_week_id_4 is compared with both numbers and '' in this section --
# confirm its type, since comparisons on a character column are textual.
#input follow-up meetings serology: distinct participants with a week 10, 26 or 52 visit,
#overall and per arm
part_followup_10 <- df_full %>% filter(visit_week_id_4 == 10) %>% distinct(PID) %>% nrow()
part_followup_10_group1 <- df_full %>% filter(group == Arm1, visit_week_id_4 == 10) %>% distinct(PID) %>% nrow()
part_followup_10_group2 <- df_full %>% filter(group == Arm2, visit_week_id_4 == 10) %>% distinct(PID) %>% nrow()
part_followup_26 <- df_full %>% filter(visit_week_id_4 == 26) %>% distinct(PID) %>% nrow()
part_followup_26_group1 <- df_full %>% filter(group == Arm1, visit_week_id_4 == 26) %>% distinct(PID) %>% nrow()
part_followup_26_group2 <- df_full %>% filter(group == Arm2, visit_week_id_4 == 26) %>% distinct(PID) %>% nrow()
part_followup_52 <- df_full %>% filter(visit_week_id_4 == 52) %>% distinct(PID) %>% nrow()
part_followup_52_group1 <- df_full %>% filter(group == Arm1, visit_week_id_4 == 52) %>% distinct(PID) %>% nrow()
part_followup_52_group2 <- df_full %>% filter(group == Arm2, visit_week_id_4 == 52) %>% distinct(PID) %>% nrow()
#input follow-up (number of visits): one row per participant with its number of distinct
#follow-up visits (visit week and visit date combinations)
follow_up_visits <- df_full %>%
  filter(visit_week_id_4 != '') %>%
  distinct(PID, visit_week_id_4, visit_date_4) %>%
  group_by(PID) %>%
  summarize(number_of_visits = n())
# part_followup_visit_1 <- df_full %>% filter(visit_week_id_4 == 1) %>% distinct(PID) %>% nrow() #actual number with visit week ID 1
# part_followup_visit_k = number of participants with at least k follow-up visits
part_followup_visit_1 <- sum(follow_up_visits$number_of_visits >= 1)
part_followup_visit_2 <- sum(follow_up_visits$number_of_visits >= 2)
part_followup_visit_3 <- sum(follow_up_visits$number_of_visits >= 3)
part_followup_visit_4 <- sum(follow_up_visits$number_of_visits >= 4)
part_followup_visit_5 <- sum(follow_up_visits$number_of_visits >= 5)
part_followup_visit_6 <- sum(follow_up_visits$number_of_visits >= 6)
part_followup_visit_7 <- sum(follow_up_visits$number_of_visits >= 7)
part_followup_visit_8 <- sum(follow_up_visits$number_of_visits >= 8)
part_followup_visit_9 <- sum(follow_up_visits$number_of_visits >= 9)
part_followup_visit_10 <- sum(follow_up_visits$number_of_visits >= 10)
part_followup_visit_11 <- sum(follow_up_visits$number_of_visits >= 11)
part_followup_visit_12 <- sum(follow_up_visits$number_of_visits >= 12)
part_followup_visit_13 <- sum(follow_up_visits$number_of_visits >= 13)
part_followup_visit_14 <- sum(follow_up_visits$number_of_visits >= 14)
part_followup_visit_15 <- sum(follow_up_visits$number_of_visits >= 15)
part_followup_visit_16 <- sum(follow_up_visits$number_of_visits >= 16)
part_followup_visit_17 <- sum(follow_up_visits$number_of_visits >= 17)
part_followup_visit_18 <- sum(follow_up_visits$number_of_visits >= 18)
part_followup_visit_19 <- sum(follow_up_visits$number_of_visits >= 19)
part_followup_visit_20 <- sum(follow_up_visits$number_of_visits >= 20)
#input follow-up (monthly): distinct participants with a visit in each 4-week window
#(lower bound exclusive, upper bound inclusive; the first window has no lower bound)
followup_4 <- df_full %>% filter(visit_week_id_4 <= 4) %>% distinct(PID)
followup_8 <- df_full %>% filter(visit_week_id_4 > 4, visit_week_id_4 <= 8) %>% distinct(PID)
followup_12 <- df_full %>% filter(visit_week_id_4 > 8, visit_week_id_4 <= 12) %>% distinct(PID)
followup_16 <- df_full %>% filter(visit_week_id_4 > 12, visit_week_id_4 <= 16) %>% distinct(PID)
followup_20 <- df_full %>% filter(visit_week_id_4 > 16, visit_week_id_4 <= 20) %>% distinct(PID)
followup_24 <- df_full %>% filter(visit_week_id_4 > 20, visit_week_id_4 <= 24) %>% distinct(PID)
part_followup_4 <- nrow(followup_4)
part_followup_8 <- nrow(followup_8)
part_followup_12 <- nrow(followup_12)
part_followup_16 <- nrow(followup_16)
part_followup_20 <- nrow(followup_20)
part_followup_24 <- nrow(followup_24)
#demographics
# Every frame below starts from one record per participant: distinct(PID, .keep_all = TRUE)
# keeps only the FIRST record of each PID, which prevents double counting of
# participants that have several records (for example several events).

# Numerical demographics.
df_demographics_numerical <- df_full %>%
  distinct(PID, .keep_all = TRUE) %>%
  select(weight, height, BMI, age, group, site)

# Binary demographics, stored as factors.
df_demographics_binary <- df_full %>%
  distinct(PID, .keep_all = TRUE) %>%
  select(gender, HIVrapid_result, smoking, group, site) %>%
  mutate(
    gender = factor(gender),
    HIVrapid_result = factor(HIVrapid_result),
    smoking = factor(smoking)
  )

# Categorical demographics; an answer of "Other" is replaced by the accompanying free-text comment.
df_demographics_categorical <- df_full %>%
  distinct(PID, .keep_all = TRUE) %>%
  select(country_birth, country_birth.other_comment, ethnicity, ethnicity.other_comment, education, group, site) %>%
  mutate(
    ethnicity = factor(ifelse(ethnicity != "Other",
                              as.character(ethnicity),
                              as.character(ethnicity.other_comment))),
    country_birth = factor(ifelse(country_birth != "Other",
                                  as.character(country_birth),
                                  as.character(country_birth.other_comment))),
    education = factor(education)
  )

# Work-related categories; job titles are lower-cased so spelling variants in case collapse into one level.
df_work_categorical <- df_full %>%
  distinct(PID, .keep_all = TRUE) %>%
  select(job_category, job_title, unit, work_hours, expect_interact, group, site) %>%
  mutate(
    job_category = factor(job_category),
    job_title = factor(tolower(job_title)),
    unit = factor(unit),
    work_hours = factor(work_hours),
    expect_interact = factor(expect_interact)
  )
#risks
# One record per participant (first record per PID) with the risk-factor columns.
df_risk <- df_full %>%
  distinct(PID, .keep_all = TRUE) %>%
  select(PID, age, gender, BMI, height, weight, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, site, expect_interact, smoking)

#summarize continuous risks
# Median and quartiles (25% / 75%), overall and per arm (Arm1 / Arm2); NAs are ignored.

# Age
sum_age_med <- with(df_risk, median(age, na.rm = TRUE))
sum_age_lwr <- with(df_risk, quantile(age, 0.25, na.rm = TRUE))
sum_age_upr <- with(df_risk, quantile(age, 0.75, na.rm = TRUE))
sum_age_med_group1 <- with(df_risk, median(age[group == Arm1], na.rm = TRUE))
sum_age_lwr_group1 <- with(df_risk, quantile(age[group == Arm1], 0.25, na.rm = TRUE))
sum_age_upr_group1 <- with(df_risk, quantile(age[group == Arm1], 0.75, na.rm = TRUE))
sum_age_med_group2 <- with(df_risk, median(age[group == Arm2], na.rm = TRUE))
sum_age_lwr_group2 <- with(df_risk, quantile(age[group == Arm2], 0.25, na.rm = TRUE))
sum_age_upr_group2 <- with(df_risk, quantile(age[group == Arm2], 0.75, na.rm = TRUE))

# BMI
sum_BMI_med <- with(df_risk, median(BMI, na.rm = TRUE))
sum_BMI_lwr <- with(df_risk, quantile(BMI, 0.25, na.rm = TRUE))
sum_BMI_upr <- with(df_risk, quantile(BMI, 0.75, na.rm = TRUE))
sum_BMI_med_group1 <- with(df_risk, median(BMI[group == Arm1], na.rm = TRUE))
sum_BMI_lwr_group1 <- with(df_risk, quantile(BMI[group == Arm1], 0.25, na.rm = TRUE))
sum_BMI_upr_group1 <- with(df_risk, quantile(BMI[group == Arm1], 0.75, na.rm = TRUE))
sum_BMI_med_group2 <- with(df_risk, median(BMI[group == Arm2], na.rm = TRUE))
sum_BMI_lwr_group2 <- with(df_risk, quantile(BMI[group == Arm2], 0.25, na.rm = TRUE))
sum_BMI_upr_group2 <- with(df_risk, quantile(BMI[group == Arm2], 0.75, na.rm = TRUE))

# Pack years: quartiles only, restricted to non-missing, non-zero values.
sum_smoke_lwr <- with(df_risk, quantile(pack_years[!is.na(pack_years) & pack_years != 0], 0.25, na.rm = TRUE))
sum_smoke_upr <- with(df_risk, quantile(pack_years[!is.na(pack_years) & pack_years != 0], 0.75, na.rm = TRUE))
sum_smoke_lwr_group1 <- with(df_risk, quantile(pack_years[!is.na(pack_years) & pack_years != 0 & group == Arm1], 0.25, na.rm = TRUE))
sum_smoke_upr_group1 <- with(df_risk, quantile(pack_years[!is.na(pack_years) & pack_years != 0 & group == Arm1], 0.75, na.rm = TRUE))
sum_smoke_lwr_group2 <- with(df_risk, quantile(pack_years[!is.na(pack_years) & pack_years != 0 & group == Arm2], 0.25, na.rm = TRUE))
sum_smoke_upr_group2 <- with(df_risk, quantile(pack_years[!is.na(pack_years) & pack_years != 0 & group == Arm2], 0.75, na.rm = TRUE))
#summarize categorical
# if(CLOSED == T){
# placeholder = quo(group) #create placeholder for closed report, to stratify on group
# } else {
# placeholder = quo(trunc(group-1.5)) #use trunc function to round both -0.5 and 0.5 to zero (1-1.5 = -0.5, 2-1.5 = 0.5)--> both arms become 0
# }

# Count participants per level and arm for each categorical risk factor; the
# factor column is renamed to a common `category` so the three tables stack.
df_risk_gender <- df_risk %>%
  group_by(gender, group) %>%
  summarize(count = n()) %>%
  rename(category = gender)

df_risk_ethnicity <- df_risk %>%
  group_by(ethnicity, group) %>%
  summarize(count = n()) %>%
  rename(category = ethnicity)

df_risk_job_category <- df_risk %>%
  group_by(job_category, group) %>%
  summarize(count = n()) %>%
  rename(category = job_category)

# Stack the three tables and fix the display order of the categories.
# Any category value not listed in `levels` becomes NA.
df_risk_categorical_sum <- rbind(df_risk_gender, df_risk_ethnicity, df_risk_job_category)
df_risk_categorical_sum$category <- factor(
  df_risk_categorical_sum$category,
  levels = c('Male', 'Female', 'African', 'Caucasian', 'Coloured', 'Indian', 'Other', 'Doctor', 'Nurse', 'Essential_workers', 'Support_staff', 'Frontline_workers')
)
#summarize medical risk factors
# Long format: one row per participant and risk column in the range medhis_dm:bcg_scar.
# value2 recodes 'Yes' -> 1 and 'No' -> 0; any other value (including NA) becomes NA.
df_med_risk <- df_risk %>%
  pivot_longer(medhis_dm:bcg_scar, names_to = 'risk', values_to = 'value') %>%
  mutate(
    value2 = ifelse(value == 'Yes',
                    1,
                    ifelse(value == 'No', 0, NA))
  )

# Number of 'Yes' answers per risk factor and arm (NA answers do not contribute).
df_med_risk_sum <- df_med_risk %>%
  group_by(risk, group) %>%
  summarize(prevalence = sum(value2, na.rm = TRUE))
```
```{r automatic input quality, eval = eval_dsmb, echo = echo_dsmb, message = F}
##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: data management and quality #
##########################################
#Quality management of the data
# Total number of QC records with a non-empty qc_correct answer, summed over the four QC data sets.
n_qc <- nrow(filter(df_qc_eCRF, qc_correct != '')) +
  nrow(filter(df2_qc_eCRF, qc_correct != '')) +
  nrow(filter(df3_qc_eCRF, qc_correct != '')) +
  nrow(filter(df4_qc_eCRF, qc_correct != ''))

#correction needed: yes
n_qc_yes <- nrow(filter(df_qc_eCRF, qc_correct == 'Yes')) +
  nrow(filter(df2_qc_eCRF, qc_correct == 'Yes')) +
  nrow(filter(df3_qc_eCRF, qc_correct == 'Yes')) +
  nrow(filter(df4_qc_eCRF, qc_correct == 'Yes'))

#correction needed: no
n_qc_no <- nrow(filter(df_qc_eCRF, qc_correct == 'No')) +
  nrow(filter(df2_qc_eCRF, qc_correct == 'No')) +
  nrow(filter(df3_qc_eCRF, qc_correct == 'No')) +
  nrow(filter(df4_qc_eCRF, qc_correct == 'No'))

# Unique, sorted QC dates of all records that have at least one non-empty qc_correct_* answer.
# Dates are converted to character first: a factor cannot be arranged chronologically.
qc_date0 <- df_full %>%
  filter(qc_correct_1 != '' | qc_correct_2 != '' | qc_correct_3 != '' | qc_correct_4 != '') %>%
  pivot_longer(c(qc_date_1, qc_date_2, qc_date_3, qc_date_4), names_to = 'date_name', values_to = 'date') %>%
  transmute(date = as.character(date)) %>%
  na.omit() %>%
  unique() %>%
  arrange(date)

# Human-readable version of the QC dates (weekday, day month year).
qc_date <- format(as.Date(qc_date0$date), '%A, %d %B %Y')

#input date of last data review:
# assumed to be the last date in the qc_ecf.dat file (ignoring empty records)
date_last_review <- max(as.Date(qc_date0$date), na.rm = TRUE)
```
```{r automatic input vaccinations, eval = eval_dsmb, echo = echo_dsmb, message = F}
##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: other vaccinations than BCG #
##########################################
# COVID19 vaccination
# Number of distinct participants with vaccine == 'Yes' and a non-empty
# COVID-19 vaccine name: overall, then per arm.
part_vaccine_C19 <- df_full %>%
  filter(vaccine == 'Yes', Covid_VAC_name != '') %>%
  distinct(PID) %>%
  nrow()

part_vaccine_C19_group1 <- df_full %>%
  filter(group == Arm1, vaccine == 'Yes', Covid_VAC_name != '') %>%
  distinct(PID) %>%
  nrow()

part_vaccine_C19_group2 <- df_full %>%
  filter(group == Arm2, vaccine == 'Yes', Covid_VAC_name != '') %>%
  distinct(PID) %>%
  nrow()
#check for adverse events after vaccination date
# Events that started after the participant's COVID-19 vaccination date.
# The COVID-19 vaccination records are those with a non-empty Covid_VAC_name,
# the same criterion used for the part_vaccine_C19 counts. The previous
# `Covid_VAC_name == ''` selected the records WITHOUT a COVID-19 vaccine name
# (which this file treats as flu vaccinations) while still comparing against
# Date_Covid_VAC, so it did not select post-COVID-vaccination events.
ae_vaccine <- df_full %>%
  filter(vaccine == 'Yes',
         Covid_VAC_name != '') %>%
  filter(event_start_all > Date_Covid_VAC) %>%
  distinct(PID, group, event_number, Covid_VAC_name, event_name_all, LLT, event_HS, Date_Covid_VAC, event_start_all, date_vaccination) %>%
  # time_def: time from COVID-19 vaccination to event start (difference of Dates)
  mutate(time_def = as.Date(event_start_all) - as.Date(Date_Covid_VAC)) %>%
  select(PID, group, event_number, Covid_VAC_name, Date_Covid_VAC, event_start_all, time_def, event_HS, LLT, event_name_all)

# Number of distinct events (PID x event_number) after vaccination: overall and per arm.
n_ae_vaccine <- ae_vaccine %>%
  distinct(PID, event_number, group) %>%
  nrow()
n_ae_vaccine_group1 <- ae_vaccine %>%
  filter(group == Arm1) %>%
  distinct(PID, event_number, group) %>%
  nrow()
n_ae_vaccine_group2 <- ae_vaccine %>%
  filter(group == Arm2) %>%
  distinct(PID, event_number, group) %>%
  nrow()

# Number of distinct participants with at least one such event: overall and per arm.
part_ae_vaccine <- ae_vaccine %>%
  distinct(PID) %>%
  nrow()
part_ae_vaccine_group1 <- ae_vaccine %>%
  filter(group == Arm1) %>%
  distinct(PID) %>%
  nrow()
part_ae_vaccine_group2 <- ae_vaccine %>%
  filter(group == Arm2) %>%
  distinct(PID) %>%
  nrow()
```
```{r automatic input final date, eval = eval_dsmb, echo = echo_dsmb, message = F}
##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: final date in trial #
##########################################
#get final date (after the final data transfer)
# Visit-based final date: latest visit_date_4 among records whose
# visit_week_id_4 is one of 52, -99, 50 or 53, one row per PID.
df_final <- df_full %>%
  filter(visit_week_id_4 %in% c(52, -99, 50, 53)) %>%
  transmute(PID, final_date = as.Date(visit_date_4)) %>%
  arrange(desc(final_date)) %>% # latest date first, so distinct() below keeps it
  distinct(PID, .keep_all = TRUE) %>%
  #for fatalities, the last event stop is the final date (date of death)
  rbind(
    df_full %>%
      filter(event_HS == 7) %>%
      distinct(PID, final_date = event_stop_all) %>%
      arrange(PID, desc(final_date)) %>%
      distinct(PID, .keep_all = TRUE)
  )

# Attach final_date to every record.
# NOTE(review): if a PID has both a qualifying visit record and an
# event_HS == 7 record, df_final holds two rows for it and this join
# duplicates that participant's records -- confirm this cannot occur.
df_full <- df_full %>%
  left_join(df_final, by = 'PID')

#check if the final date is the same as the withdraw date
# df_full %>% filter(!is.na(withdraw_date)) %>% distinct(withdraw_date, final_date)
#cumulative enrolment
# Per arm: sum over vaccination dates of (participants enrolled that day x weeks
# elapsed since that day), relative to 52 weeks for all part_enrol participants.
# NOTE(review): elapsed weeks are measured up to Sys.Date(), so the value depends
# on the day the report is rendered -- confirm this is intended.
df_max_cum_partweek <- df_full %>%
  distinct(PID, .keep_all = TRUE) %>% # one record per ID
  group_by(date_vaccination, group) %>% # enrolment per date and group
  summarise(enrolled = n()) %>%
  mutate(week = as.numeric(difftime(Sys.Date(), as.Date(date_vaccination), units = 'week'))) %>%
  group_by(group) %>% # cumulative enrolment per group
  # arrange(date_vaccination) %>%
  mutate(
    partweek = enrolled * week,
    cum_partweek = cumsum(partweek) / (52 * part_enrol) # relative to total study
  ) %>%
  summarise(max_cum_partweek = max(cum_partweek)) # still grouped by group here
```
```{r automatic input censoring, eval = eval_dsmb, echo = echo_dsmb, message = F}
##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: right-censoring for vaccination #
# (1), withdrawal (2), death (3), flu (4)#
##########################################
# Create censoring datasets for downstream Kaplan Meier or survival analysis (date of censoring, status = 0 [because no event], censor category)
# Columns: PID, censor_date, Covid_VAC_name, date_vaccination, censor, time, status.
# censor: 1 = COVID-19 vaccination, 2 = withdrawal, 3 = death, 4 = flu vaccination.
# The full_join() calls have no `by`, so they join on all shared columns; since
# `censor` differs between the parts, each join simply appends the rows.
df_PID_censor <- df_full %>%
  # (1) first COVID-19 vaccination per participant
  filter(vaccine == 'Yes', Date_Covid_VAC != '') %>%
  arrange(Date_Covid_VAC) %>% # earliest first, so distinct() keeps the first vaccination
  select(PID, Date_Covid_VAC, Covid_VAC_name, date_vaccination) %>%
  distinct(PID, .keep_all = TRUE) %>%
  rename(censor_date = Date_Covid_VAC) %>%
  mutate(censor_date = as.Date(censor_date), censor = 1) %>%
  # (2) withdrawal
  full_join(
    df_full %>%
      filter(withdraw_date != '') %>%
      select(PID, censor_date = withdraw_date, date_vaccination) %>%
      distinct(PID, censor_date, .keep_all = TRUE) %>%
      mutate(censor_date = as.Date(censor_date), censor = 2)
  ) %>%
  # (3) death: final_date of participants with an event_HS == 7 record
  full_join(
    df_full %>%
      filter(event_HS == 7) %>%
      select(PID, censor_date = final_date, date_vaccination) %>%
      distinct(PID, censor_date, .keep_all = TRUE) %>%
      mutate(censor = 3)
  ) %>%
  arrange(PID, censor_date) %>%
  mutate(
    # days from date_vaccination to the censoring date
    time = as.numeric(as.Date(censor_date, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days'),
    status = 0
  )

# Same as df_PID_censor plus (4) flu vaccination. It reuses the rows of
# df_PID_censor (vaccination, withdrawal, death), appends the flu rows, then
# re-sorts and recomputes time/status over the combined set.
df_PID_censor_withflu <- df_PID_censor %>%
  select(-time, -status) %>%
  full_join(
    df_full %>%
      filter(vaccine == 'Yes', Covid_VAC_name == '') %>% # vaccinated, but no COVID-19 vaccine name
      distinct(PID, vaccine, Covid_VAC_name, flu_vac_date, date_vaccination) %>%
      # assumed date of vaccination based on received column of the questionnaire
      separate(flu_vac_date, into = c('Date', 'Time'), sep = ' ') %>%
      transmute(PID, censor_date = as.Date(Date, format = '%d-%m-%Y'), date_vaccination) %>%
      distinct(PID, censor_date, .keep_all = TRUE) %>%
      mutate(censor = 4)
  ) %>%
  arrange(PID, censor_date) %>%
  mutate(
    time = as.numeric(as.Date(censor_date, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days'),
    status = 0
  )

#Intention to treat means censoring at withdrawal/death only, no censoring=1 for others (that is PP dataset)
df_PID_censor_withflu_ITT <- filter(df_PID_censor_withflu, censor %in% c(2, 3))
```
```{r automatic input efficacy, eval = eval_dsmb, echo = echo_dsmb, message = F}
##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: efficacy (HS over time) #
##########################################
#input efficacy
#original event form
# Long table of health scores (HS) from the original event form: one row per
# event and event week, with Time (weeks since date_vaccination) and TimeDate.
hs_time_event0_original <- df_full %>%
  filter(!is.na(event_start)) %>% #only select RTI events which have a non-NA event_start
  select(PID, group, event_number, Onset:Week_5, event_start_all, date_vaccination, final_date, event_status, event_ongoing, event_stop) %>%
  arrange(PID, event_start_all) %>% #to make sure the first start date of an event is first
  mutate(onset_week = round(as.numeric(as.Date(event_start_all, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'weeks'))) %>%
  filter(!is.na(Onset)) %>%
  filter(Onset != 'N_A') %>% #coding in case of SARS-CoV-2 antibody test positive and Asymptomatic COVID-19
  #Onset and Week_1 are for the first week (onset = day 1, Week_1 = day 2-7)
  # Highest score of the first week. pmax() is vectorised and, unlike a row-wise
  # max(..., na.rm = TRUE), yields NA (not -Inf) when neither score is numeric,
  # so such rows are removed by the !is.na(HS) filter below instead of
  # propagating an HS of -Inf.
  mutate(OnsetWeek1 = as.character(pmax(as.numeric(as.character(Onset)), as.numeric(as.character(Week_1)), na.rm = TRUE))) %>%
  ungroup() %>%
  as_tibble() %>% # the former rowwise()/ungroup() step always returned an ungrouped tibble
  select(PID, group, event_number, OnsetWeek1, Week_2:Week_5, event_start_all, date_vaccination, final_date, event_status, event_ongoing, event_stop, onset_week) %>%
  # 'Week_2' splits into Time0 = 'Week', Time00 = '2'; 'OnsetWeek1' has no '_' so its Time00 is NA
  pivot_longer(cols = OnsetWeek1:Week_5, names_to = c('Time0', 'Time00'), names_sep = '_', values_to = 'HS') %>%
  mutate(Time = ifelse(Time0 == 'OnsetWeek1', onset_week, onset_week + as.numeric(as.character(Time00)) - 1)) %>% #get time in weeks since vaccination minus 1 (because onset and week1 are the same week)
  mutate(HS = as.numeric(as.character(HS)),
         Time0 = tolower(Time0), #for merging original and fu (which have a capital W difference in week)
         TimeDate = if_else(Time0 == 'onsetweek1',
                            as.Date(event_start_all),
                            as.Date(event_start_all) + 7 * (as.numeric(as.character(Time00)) - 1))) %>% #to get the actual date for the event week
  filter(!is.na(HS)) %>% #remove NAs for HS
  distinct()
#transform to integer
# Follow-up score columns week_1 .. week_12 are converted to numeric; going
# through character first makes a factor yield its labels rather than its codes.
column_to_integer <- paste0("week_", 1:12)
df_full <- df_full %>%
  mutate_at(column_to_integer, function(x) as.numeric(as.character(x)))
#follow-up event form
# Long table of health scores (HS) from the follow-up event form (week_1 .. week_12),
# with the same columns as hs_time_event0_original so the two can be row-bound.
hs_time_event0_fu <- df_full %>%
  filter(!is.na(event_start), #only select RTI events which have a non-NA event_start
         !is.na(week_1)) %>%
  select(PID, group, event_number, week_1:week_12, event_start_all, date_vaccination, final_date, event_status, event_ongoing, event_stop) %>%
  arrange(PID, event_start_all) %>% #to make sure the first start date of an event is first
  mutate(onset_week = round(as.numeric(as.Date(event_start_all, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'weeks'))) %>%
  # 'week_3' splits into Time0 = 'week', Time00 = '3'
  pivot_longer(cols = week_1:week_12, names_to = c('Time0', 'Time00'), names_sep = '_', values_to = 'HS') %>%
  # Time0 is always 'week' here, so the onset branch of the original form never
  # applies: every row is onset week + (week number - 1).
  mutate(
    Time = onset_week + as.numeric(as.character(Time00)) - 1, #weeks since vaccination (week 1 is the onset week)
    HS = as.numeric(as.character(HS)),
    Time0 = tolower(Time0),
    TimeDate = as.Date(event_start_all) + 7 * (as.numeric(as.character(Time00)) - 1) #actual date for the event week
  )
# Combine the original and follow-up event forms into one long HS table.
hs_time_event0 <- rbind(hs_time_event0_original, hs_time_event0_fu) %>%
  #remove overlapping records from original and follow-up
  distinct(PID, group, event_number, event_start_all, date_vaccination, final_date, Time, HS, event_status, event_ongoing, event_stop, TimeDate) %>%
  # Only rows with a recorded HS are kept. The former second filter (drop rows
  # with NA HS that lie after the final date) could never remove anything once
  # NA HS rows are gone, so recorded scores after final_date are retained.
  filter(!is.na(HS))

#those subjects with an event
part_hs_time_event <- arrange(distinct(hs_time_event0, PID), PID)
#create dataset with 0 values from start to end per event number, then anti-join with the actual event data
# One row per (PID, event_number): sorting on event_start_all first makes
# distinct() keep the record with the earliest start date when an event number
# has several records. PID_eventnumber ("<PID>_<event_number>") is the sort key.
hs_time_event_pre <- hs_time_event0 %>%
  arrange(PID, event_number, event_start_all) %>%
  distinct(PID, event_number, .keep_all = TRUE) %>%
  mutate(PID_eventnumber = paste(PID, event_number, sep = '_')) %>%
  arrange(PID_eventnumber)
hs_time_event_pre_zero <- data.frame() # create dataset to which the rows will be bound per participant (number of rows differ per participant which makes it challenging)
for(i in 1:length(hs_time_event_pre$PID_eventnumber)){ #~75 seconds
week_seq <- seq(0, round(as.numeric(as.Date(ifelse(is.na(hs_time_event_pre$final_date[i]), date_cut_off, hs_time_event_pre$final_date[i]), origin = "1970-01-01") - as.Date(hs_time_event_pre$date_vaccination[i]), unit = 'weeks'))) #week sequence from start to final date or cut-off date
hs_time_event_pre_zero <- hs_time_event_pre[rep(i, length(week_seq)),] %>% #repeat the first record with the number of added records necessary from 0 to the length of the weekly sequence
mutate(HS = 0) %>% # and health status 0
mutate(Time = week_seq) %>% # assign the weekly sequence
rbind(hs_time_event_pre_zero) %>%
arrange(PID_eventnumber)