Reproducibility_PMX_BCG-CoV-19_Statistical analysis framework_v1.Rmd

---
params:
  sub_title:
    input: text
    value: 'blank_placeholder'
title: "Unblinded data report to Data Safety Monitoring Board" 
subtitle: "`r params$sub_title`"
author: 
- "Dr. Rob van Wijk, Laurynas Mockeliunas, MSc., Prof. Dr. Ulrika Simonsson  \n *Department of Pharmaceutical Biosciences, Uppsala University, Sweden*"  
date: "`r format(Sys.time(), '%A, %d %B %Y')`"
output:
  pdf_document: 
        extra_dependencies: ["float"]
        includes:
            in_header: header_unblinded.tex
        toc: yes
knit: (
  function(inputFile, encoding) { 
    pSubTitle <- "BCG re-vaccination for healthcare workers in SARS-CoV-2 pandemic"
    
    rmarkdown::render( 
      input       = inputFile, 
      encoding    = encoding, 
      params      = list(sub_title = pSubTitle),      
      output_file = paste(format(Sys.time(), "%Y%m%d_Unblinded_datareporttoDSMB_"), pSubTitle, sep = '')) })
editor_options: 
  chunk_output_type: console
---


```{r setup, include=FALSE}
# Start from an empty workspace (hidden objects included)
rm(list = ls(all.names = TRUE))

# Record the start time so the compilation time can be reported
ptm <- proc.time()

# Default knitr options for every chunk in the document
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE,
  fig.height = 3.5,
  comment = '>',
  tidy.opts = list(width.cutoff = 70),
  tidy = TRUE
)

# Session options; knitr.kable.NA = '' gives empty table cells for NAs
options(
  scipen = 100,
  digits = 4,
  knitr.kable.NA = '',
  fig.pos = "!H",
  out.extra = ""
)

###
# Report type and arm labels
#
# CLOSED is used throughout the report (IF statements and labelling) to
# recreate the same figures and tables for either
#  - the closed session of the trial meetings: trial data reported per arm,
#    blinded during the trial [arm1, arm2] and unblinded after data lock, or
#  - the open session of the trial meetings: trial data only reported at
#    summary level.
###

CLOSED <- TRUE # is the report open or closed --> also change:
  # 1. title (line 6)
  # 2. header.tex (line 15)
  # 3. filename report (line 25)
Arm1 <- 'placebo'
Arm2 <- 'BCG'

#load libraries

library(tidyverse)
library(viridis)
library(knitr)
library(kableExtra)
library(survival)
library(survminer) 
library(gridExtra)
library(data.table)
library(ggtext) 

# Default plotting theme: black-and-white theme without panel grid lines
theme_set(theme_bw() + theme(panel.grid = element_blank()))

# Master directory, selected by the machine that runs the script (the folder
# structure below it is the same on both machines).
# A scalar if/else is used instead of ifelse(): ifelse() keeps the attributes
# of its test, which left WD carrying a stray "nodename" name.
WD <- if (Sys.info()[["nodename"]] == 'RCVW') {
  'X:/1.Postdoc'
} else {
  '//user.uu.se/bmci/FBV-Users/robva847/Documents/1.Postdoc'
}

# Directories of the input master datasets, relative to WD
wd1 <- '/3.Projects/4.BCG/1.Input/Screening_and_Enrolment_(V40_18_Jun_2020)_20211025-105845/'
wd2 <- '/3.Projects/4.BCG/1.Input/Events_(V10_01_Jun_2020)_20211025-110415/'
wd3 <- '/3.Projects/4.BCG/1.Input/Lab_results_(V10_01_Jul_2020)_20211025-110001/'
wd4 <- '/3.Projects/4.BCG/1.Input/Follow_up_(V40_22_Feb_2021)_20211025-110214/'

# Chunk-level switches, referenced as eval = / echo = in the chunk headers
eval_dsmb <- TRUE
echo_dsmb <- FALSE

# Summarise every column of `variable` in a single matrix.
#
# variable: a data frame (or list of vectors); the quantile, mean and standard
#           deviation rows require numeric columns.
# Returns a numeric matrix with one column per element of `variable` and the
# rows: total number, number of null, number of NA, minimum, maximum,
# 1st quantile, 3rd quantile, median, mean, standard deviation.
# All statistics except the counts ignore NA values.
summary_table <- function(variable) {

  # vapply() guarantees exactly one number per column for every statistic,
  # whereas sapply() changes its return type with the input
  per_column <- function(fun, ...) {
    vapply(variable, fun, FUN.VALUE = numeric(1), ...)
  }

  # number of values equal to zero ("null"), NA values not counted
  count_zero <- function(x) sum(x == 0, na.rm = TRUE)

  # number of NA values
  count_na <- function(x) sum(is.na(x))

  rbind(
    # number of records, including NA
    `total number` = per_column(length),
    `number of null` = per_column(count_zero),
    `number of NA` = per_column(count_na),
    # min, max, quartiles and median via quantile()
    minimum = per_column(quantile, probs = 0, na.rm = TRUE),
    maximum = per_column(quantile, probs = 1, na.rm = TRUE),
    `1st quantile` = per_column(quantile, probs = 0.25, na.rm = TRUE),
    `3rd quantile` = per_column(quantile, probs = 0.75, na.rm = TRUE),
    median = per_column(quantile, probs = 0.5, na.rm = TRUE),
    # mean and standard deviation
    mean = per_column(mean, na.rm = TRUE),
    `standard deviation` = per_column(sd, na.rm = TRUE)
  )
}

```

```{r enrolment input data, eval = eval_dsmb, echo = echo_dsmb} 
#####################################################
# Input from Screening and Enrolment master dataset #
#####################################################

# Read the comma-separated master table (with header row) from its source folder
df1_full <- read.table(
  file = paste0(WD, wd1, 'table.dat'),
  header = TRUE,
  sep = ',',
  fileEncoding = 'UTF-8-BOM'
)

```


```{r event input data, eval = eval_dsmb, echo = echo_dsmb} 
####################################
# Input from Events master dataset #
####################################

# Read the comma-separated master table (with header row) from its source folder
df2_full <- read.table(
  file = paste0(WD, wd2, 'table.dat'),
  header = TRUE,
  sep = ',',
  fileEncoding = 'UTF-8-BOM'
)


```

```{r lab input data, eval = eval_dsmb, echo = echo_dsmb} 
##################################
# Input from Labs master dataset #
##################################

# Read the comma-separated master table (with header row) from its source folder
df3_full <- read.table(
  file = paste0(WD, wd3, 'table.dat'),
  header = TRUE,
  sep = ',',
  fileEncoding = 'UTF-8-BOM'
)


```

```{r follow-up input data, eval = eval_dsmb, echo = echo_dsmb} 
#######################################
# Input from Follow-up master dataset #
#######################################

# Read the comma-separated master table (with header row) from its source folder
df4_full <- read.table(
  file = paste0(WD, wd4, 'table.dat'),
  header = TRUE,
  sep = ',',
  fileEncoding = 'UTF-8-BOM'
)

```

```{r combine all datasets, eval = eval_dsmb, echo = echo_dsmb}
# Combine all master datasets by PID, so that all records (which can have
# varying Submission.ID values) are assigned to their individual participant,
# identified by its unique anonymized PID.
# PID is the personal identifier, coded BCGxxxx (see Data Definition Table).

df_full <- df1_full %>%
  full_join(df2_full, by = 'PID') %>%
  full_join(df3_full, by = 'PID') %>%
  full_join(df4_full, by = 'PID') %>%

  # Combine the event dates and names (respiratory tract infection, injection
  # site reaction, other) into single columns: per row, the first non-missing
  # value of the unsuffixed, _1 and _2 column is used. coalesce() on
  # as.character() values always returns a character column.
  mutate(
    event_start_all = coalesce(
      as.character(event_start),
      as.character(event_start_1),
      as.character(event_start_2)
    ),
    event_stop_all = coalesce(
      as.character(event_stop),
      as.character(event_stop_1),
      as.character(event_stop_2)
    ),
    event_name_all = coalesce(
      as.character(event_name),
      as.character(event_name_1),
      as.character(event_name_2)
    )
  )

# Assign a single group name for the open report (summary level data reporting)
if (!CLOSED) {
  df_full$group <- 'Both'
}

# Check for column names shared between the master datasets. After joining
# these are uninformatively named with .x.x or .y.y suffixes, so no variable of
# interest should be listed here (PID is the exception, it is used for binding).

n1 <- names(df1_full)
n2 <- names(df2_full)
n3 <- names(df3_full)
n4 <- names(df4_full)

all_names <- c(n1, n2, n3, n4)
# every occurrence of a name that appears more than once
shared_names <- all_names[duplicated(all_names) | duplicated(all_names, fromLast = TRUE)]
shared_names[order(shared_names)]

# Group assignment per participant
PID_group <- distinct(df_full, PID, group)

# Visual check of the number of records in each of the four master datasets

dataset_labels <- c('Screening/\nenrolment', 'Events', 'Lab results', 'Follow-up')

data.frame(
  Records = c(nrow(df1_full), nrow(df2_full), nrow(df3_full), nrow(df4_full)),
  # fix the factor levels so the bars keep the order of the datasets above
  Dataset = factor(dataset_labels, levels = dataset_labels)
) %>%
  ggplot(aes(Dataset, Records)) +
  geom_col(aes(fill = Dataset)) +
  # geom_text(aes(y = Records - 500, label = Records), col = 'white', size = 12) +
  scale_x_discrete(name = 'Master database') +
  # guide = "none" hides the legend; the logical form (guide = F) is deprecated in ggplot2
  scale_fill_viridis(option = 'inferno', end = 0.75, discrete = TRUE, guide = "none")

# Visual check of the event numbers per MedDRA lower level term (LLT):
# the ten most frequent terms (ties included), as a horizontal bar chart

top_llt <- df_full %>%
  distinct(PID, event_number, SOC, HLGT, HLT, LLT, PT) %>%
  filter(!is.na(SOC), SOC != '') %>%
  group_by(LLT) %>%
  summarize(n = n()) %>%
  arrange(desc(n)) %>%
  top_n(10, n) %>%
  arrange(n) %>%
  # levels in order of increasing count, so bars are sorted in the plot
  mutate(LLT = factor(LLT, levels = LLT)) %>%
  data.frame()

ggplot(top_llt, aes(LLT, n)) +
  geom_col() +
  coord_flip()

# Data cut off date, parsed from the enrolment folder name
# (format: "\\3.Projects\\4.BCG\\1.Input\\Screening_and_Enrolment_(V40_18_Jun_2020)_20210223-110510\\")
wd1_parts <- unlist(strsplit(wd1, split = '_')) # pieces of the folder name between underscores
wd1_stamp <- tail(wd1_parts, 1) # last piece holds date and time
wd1_date <- head(unlist(strsplit(wd1_stamp, split = '-')), 1) # part before the dash is the date
date_cut_off <- as.Date(wd1_date, format = '%Y%m%d')

```


```{r manual input report, eval = eval_dsmb, echo = echo_dsmb}

################
# MANUAL INPUT #
################

# The values below are not part of the input datasets and have to be
# entered by hand for every report. All dates use the format "yyyy-mm-dd".

# DSMB meetings
date <- "2021-11-09" # date of this DSMB meeting
date_last_dsmb <- "2021-07-06" # date of the last DSMB meeting
n_dsmb <- 6 # number of meetings

# Last QC review of the data analysis
date_last_review_data_analysis <- "2020-06-29"
reviewer_data_analysis <- "Dr. Joakim Nyberg"

# Who prepared the report
author_prep <- "Rob van Wijk, Laurynas Mockeliunas, Ulrika Simonsson"


```

```{r check automatic input report, eval = eval_dsmb, echo = echo_dsmb, message = F, warning = F, results = 'hide', include = F}

###############################
# DATA CHECK AUTOMATIC INPUT  #
# Run and check results every #
#   data transfer and query   #
#   results to data managers  #
###############################

# Check that the final date is complete: list all final_date values of
# participants that have at least one record without a final date

pid_no_final_date <- df_full %>%
  filter(is.na(final_date)) %>%
  distinct(PID, final_date) %>%
  pull(PID)

df_full %>%
  filter(PID %in% pid_no_final_date) %>%
  distinct(PID, final_date) %>%
  arrange(PID)

# Check for events with a start date after their stop date

df_full %>%
  filter(event_stop != '', event_start > event_stop) %>%
  distinct(PID, event_number, event_start, event_stop, LLT) %>%
  arrange(PID, event_number)


# Check for (ongoing) events after the final date.
# NOTE(review): hs_time_event0_original and hs_time_event0_fu are not created
# in the part of the script above this chunk - confirm they exist when knitting.
hs_ongoing_afterfinaldate <- rbind(hs_time_event0_original, hs_time_event0_fu) %>%
  # remove overlapping records from original and follow-up
  distinct(
    PID, group, event_number, event_start_all, date_vaccination, final_date,
    onset_week, Time0, Time, HS, event_ongoing, event_status, event_stop
  ) %>%
  # keep scored records later than the weeks between vaccination and final date
  filter(
    !is.na(HS) &
      Time > as.numeric(as.Date(final_date) - as.Date(date_vaccination), units = 'weeks')
  ) %>%
  mutate(
    stoptime = as.numeric(as.Date(event_stop) - as.Date(date_vaccination), units = 'weeks')
  )

# Check for negative time (one ID has a pregnancy start prior to enrolment, which can be ignored)

df_full %>%
  filter(event_start_all < date_vaccination) %>%
  distinct(PID, event_start_all, date_vaccination)

# Check the 'other vaccines' comments to make sure there is no COVID-19 related vaccine that we won't pick up
# NOTE(review): df4_bcg_vaccine is not created above this chunk either - confirm.

distinct(df4_bcg_vaccine, bcg_none_other_comment)

#99.9 will be default for missing numerical decimal fields
#999 will be default for missing integer numerical fields
# NOTE(review): going by the two lines above, the object names below look
# swapped (df_missing_data_dec searches for 999, df_missing_data_int for 99.9)
# - confirm against the code that uses them before renaming.

# Rows of df_full with at least one cell equal to 999, restricted to PID plus
# the columns in which 999 occurs. A row is repeated for every matching cell,
# hence the distinct() calls.
df_missing_data_dec <- df_full[which(df_full == 999, arr.ind = T)[,1],] %>%
  select(PID, names(df_full)[unique(which(df_full == 999, arr.ind = T)[,2])]) %>%
  filter(!is.na(PID)) %>%
  distinct(.keep_all = T) %>% #prevent double records because of follow-up or event records in which the demographics are duplicated
  select(!c('Duration..seconds..x.x','Duration..seconds..y', 'req_number')) %>% # drop the duration and req_number columns
  head(12) %>% 
  distinct()

# Same selection for cells equal to 99.9 (no column exclusion, no row limit)
df_missing_data_int <- df_full[which(df_full == 99.9, arr.ind = T)[,1],] %>%
  select(PID, names(df_full)[unique(which(df_full == 99.9, arr.ind = T)[,2])]) %>%
  filter(!is.na(PID)) %>%
  distinct(.keep_all = T) #prevent double records because of follow-up or event records in which the demographics are duplicated

## Check for different spellings: all distinct event names containing 'cov' (any case)
spelling0 <- df_full %>%
  filter(str_detect(event_name, regex('cov', ignore_case = TRUE))) %>%
  select(event_name) %>%
  distinct()
spelling0

## Remove the 'COVID-19 negative' instances (names containing 'neg')
spelling <- spelling0 %>%
  filter(!str_detect(event_name, regex('neg', ignore_case = TRUE)))

## Split the remaining names on whether they contain 'post'
spelling_nopost <- spelling %>%
  filter(!str_detect(event_name, regex('post', ignore_case = TRUE)))
spelling_post <- spelling %>%
  filter(str_detect(event_name, regex('post', ignore_case = TRUE)))

## Check the crosslink between c19_positive (follow-up question, positive C19 test)
## and the recorded event (RTI event for COVID-19)

# Participants with a positive C19 answer on a record that is coded as COVID-19
PID_COVID <- df_full %>%
  select(PID, event_name, LLT, LLT_code, c19_positive, PT_code) %>%
  # filter(event_name %in% spelling$event_name | LLT == 'COVID-19' | c19_confirm == 'Yes' | c19_positive == 'Yes') %>% #select C19 status from events or from follow-up question
  filter(
    c19_positive == 'Yes', # C19 status from the follow-up question
    # use both LLT and PT_code for COVID-19 (pneumonia)
    LLT == 'COVID-19' | LLT_code == 10084268 | PT_code == 10084268 | PT_code == 10084380
  ) %>%
  distinct(PID)

# Records with a positive C19 answer of participants that are not in PID_COVID,
# i.e. the IDs without a COVID-19 event
df_full %>%
  select(PID, event_name, LLT, c19_positive, event_HS) %>%
  # filter(event_name %in% spelling$event_name | LLT == 'COVID-19' | c19_confirm == 'Yes' | c19_positive == 'Yes') %>% #select C19 status from events or from follow-up question
  filter(
    c19_positive == 'Yes',
    !PID %in% PID_COVID$PID
  ) %>%
  distinct(PID, event_name, LLT, c19_positive, event_HS) %>%
  arrange(PID)

## Check the crosslink between post-corona syndrome and corona

# COVID-19 events: use both LLT and PT_code for COVID-19 (pneumonia) as well as
# asymptomatic (asymptomatic COVID-19 leading to long COVID reported).
# The event number is kept because the post viral record has the same event number.
covid_events <- df_full %>%
  filter(LLT == 'COVID-19' | LLT_code == 10084268 | PT_code == 10084268 | PT_code == 10084380 | PT_code == 10084459) %>%
  distinct(PID, event_number, event_name, LLT, date_vaccination)

# Post viral syndrome records. The event names have to be matched against the
# event_name column of spelling_post: matching against the data frame itself
# compares with the deparsed column and therefore (almost) never matches.
post_viral_events <- df_full %>%
  filter(
    event_name %in% spelling_post$event_name |
      LLT == 'Post viral fatigue' |
      LLT == 'Post viral fatigue syndrome' |
      PT_code == 10057244
  ) %>%
  select(PID, group, event_number, event_name, LLT, event_start_all, event_stop_all)

# Join on participant and event number; distinct events
# (x = original event, y = post viral syndrome)
post <- covid_events %>%
  inner_join(post_viral_events, by = c('PID', 'event_number')) %>%
  distinct(PID, group, event_number, LLT.x, LLT.y, .keep_all = TRUE)

# Participants with a post covid syndrome record: overall and per arm
part_post <- post %>% distinct(PID)
part_post_group1 <- post %>% filter(group == Arm1) %>% distinct(PID)
part_post_group2 <- post %>% filter(group == Arm2) %>% distinct(PID)

# Check the covid-19 LLT term --> should only be 'COVID-19' (10084268) or
# 'Asymptomatic COVID-19' (10084459), which we don't look at

df_full %>%
  filter(
    HLT %in% c('Coronavirus infections', 'Viral lower respiratory tract infections') |
      HLT_code %in% c(10047468, 10084510)
  ) %>%
  distinct(LLT, LLT_code, PT, PT_code) #, HLT, HLT_code, HLGT, HLGT_code, SOC, SOC_code

# Double check that asymptomatic COVID-19 is recorded with HS = 0
df_full %>%
  filter(LLT == 'Asymptomatic COVID-19' | LLT_code == 10084459) %>%
  distinct(PID, event_number, event_name, sars_cov_2, event_HS, LLT)

# Check for c19 with HS0; use both LLT and PT_code for COVID-19 (pneumonia)

df_full %>%
  filter(LLT == 'COVID-19' | LLT_code == 10084268 | PT_code == 10084268 | PT_code == 10084380) %>%
  filter(event_hs == 0 | event_HS == 0) %>%
  distinct(PID, event_number, event_name, LLT, event_hs, event_HS)

## health status definitions:
# event_hs / event_hs_1 / event_hs_2 record the highest health score (HS) for an event (a unique event is identified by event_number, which should not overlap --> discrepancies will be discussed)
# for respiratory tract infections (RTIs), event_hs_fu also records the highest HS for the follow-up 
# for injection site reactions (ISRs) and other events, event_hs_fu_1 also records the highest HS for the follow-up 
# the highest of these two constitutes the highest HS per event and will be reported to the DSMB --> recorded in event_HS variable
## health status over time definitions:
# HS over time is reported for RTI only, in health_status (original event form, Onset:Week_5, filled in once) and in the follow-up (event_rti_fu, follow-up RTI form, week_1:week_12, filled in with a weekly frequency)
# The tables are complementary, e.g. Week_1 in health_status should be the same as week_1 in event_rti_fu. 
# The tables will be merged for reporting HS over time 

#check for non-matching HS over time tables by subtracting the values in week_x from Week_x (to circumvent records that are NA --> not a discrepancy, just missing data)

# participants with at least one non-empty follow-up week (week_1 to week_12) in the events dataset
PID_fu <- df2_full %>% filter(week_1 != '' | week_2 != '' | week_3 != '' | week_4 != '' | week_5 != '' | week_6 != '' | week_7 != '' | week_8 != '' | week_9 != '' | week_10 != '' | week_11 != '' | week_12 != '') %>% distinct(PID)

# records of these participants where the difference between Week_x and week_x is not zero for any of weeks 1 to 5
df2_full %>% filter(PID %in% unlist(PID_fu)) %>% select(PID, event_number, Week_1:Week_5, week_1: week_10) %>% filter(as.numeric(Week_1) - as.numeric(week_1) != 0 | as.numeric(Week_2) - as.numeric(week_2) != 0 | as.numeric(Week_3) - as.numeric(week_3) != 0 | as.numeric(Week_4) - as.numeric(week_4) != 0 | as.numeric(Week_5) - as.numeric(week_5) != 0)

# the same check written as a direct comparison of Week_x and week_x
df2_full %>% filter(PID %in% unlist(PID_fu)) %>% select(PID, event_number, Week_1:Week_5, week_1: week_10) %>% filter(as.numeric(Week_1) != as.numeric(week_1) | as.numeric(Week_2) != as.numeric(week_2)  | as.numeric(Week_3) != as.numeric(week_3)  | as.numeric(Week_4) != as.numeric(week_4) | as.numeric(Week_5) != as.numeric(week_5) )

# Check for hospitalization events (HS > 3), highest HS first
df_full[which(df_full$event_HS > 3), ] %>%
  select(PID, event_HS, LLT, event_name_all) %>%
  distinct() %>%
  arrange(desc(event_HS))

# Check for empty MedDRA LLT records -> email to complete
# (the list is written to a csv file in the working directory)
df_full %>%
  filter(LLT == '') %>%
  distinct(PID, event_number, LLT, PT, LLT_code, PT_code) %>%
  arrange(PID) %>%
  write.csv('PID_eventno_without_LLT.csv')

# Check for inconclusive test results: records with a comment on the result,
# selected once as non-empty and once as non-NA comments

# SARS-CoV-2 test
df_full %>% filter(sars_cov_2.nr_comment != '') %>% select(PID, sars_cov_2, sars_cov_2.nr_comment)

df_full %>% filter(!is.na(sars_cov_2.nr_comment)) %>% select(PID, sars_cov_2, sars_cov_2.nr_comment)

# IGRA test
df_full %>% filter(igra.nr_comment != '') %>% select(PID, igra, igra.nr_comment)

df_full %>% filter(!is.na(igra.nr_comment)) %>% select(PID, igra, igra.nr_comment)

# Visual checks, per arm, of variables that are not risk factors explicitly
# visualized in the report below.

# Facet strip labels. A lookup labeller returns NA for every value it has no
# entry for, so the arm labels in use (Arm1, Arm2) are added to the original
# 'Both' / '1' / '2' entries.
arm_facet_labels <- c('Both' = 'Total trial population', '1' = '1', '2' = '2')
arm_facet_labels[c(Arm1, Arm2)] <- c(Arm1, Arm2)

# Add the fill scale, theme and per-arm facets shared by all check plots to plot p.
add_arm_layout <- function(p) {
  p +
    # guide = "none" hides the legend (the logical form guide = F is deprecated)
    scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = "none", option = 'cividis') +
    theme_bw() +
    facet_grid(~group, labeller = labeller(group = arm_facet_labels))
}

# Bar chart per arm of the df_full column named `column`.
# drop_empty = TRUE first removes the rows in which that column is "" (or NA).
plot_arm_bars <- function(column, drop_empty = TRUE) {
  plot_data <- df_full
  if (drop_empty) {
    plot_data <- filter(plot_data, .data[[column]] != "")
  }
  add_arm_layout(
    ggplot(plot_data, aes(.data[[column]])) +
      geom_bar(aes(fill = factor(group)), alpha = 0.75)
  )
}

# Density-scaled histogram with density curve per arm of a numeric df_full column.
plot_arm_histogram <- function(column) {
  add_arm_layout(
    ggplot(df_full, aes(.data[[column]])) +
      geom_histogram(aes(y = after_stat(density), fill = factor(group)), alpha = 0.5) +
      geom_density()
  )
}

# vital signs and substance use (continuous)
for (i in c("BP_systolic", "BP_diastolic", "heart_rate", "resp_rate", "temp", "alcohol_day", "cannabis_week")) {
  print(plot_arm_histogram(i))
}

# vital signs within normal range (categorical, empty values kept)
for (i in c("BP_normal", "hr_normal", "rr_normal", "temp_normal")) {
  print(plot_arm_bars(i, drop_empty = FALSE))
}

# other variables
for (i in c("bubble", "patients_seen", "expect_interact", "post_meno", "contracept", "hyster", "bearing_potential", "doctor_4wks", "contact_covid19", "tested_covid19", "contact_tb", "HIVrapid_result", "fluvac", "other_vac", "chemo", "anticyto", "steroids", "covid_meds")) {
  print(plot_arm_bars(i))
}

# medical history
for (i in c("medhis_tb", "medhis_cerebro", "medhis_cancer", "medhis_transplant", "medhis_immuno", "medhis_blood", "medhis_rti", "medhis_allergy", "medhis_hey_fever", "medhis_sinus", "medhis_other", "medhis_other1", "medhis_other2")) {
  print(plot_arm_bars(i))
}

# baseline symptoms
for (i in c("sym_fever_1", "sym_cough_1", "sym_cough_prod", "sym_cold_1", "sym_breath_1", "sym_fatigue_1", "sym_throat_1", "sym_headache_1", "sym_pain_1", "sym_any")) {
  print(plot_arm_bars(i))
}

# Check for covid medication, steroids or other exclusion criteria: list the
# participants with the finding next to the corresponding exclusion variable

df_full %>%
  filter(HIVrapid_result == 'Positive' | self_report_HIV == 'HIV_positive') %>%
  distinct(PID, HIVrapid_result, self_report_HIV, excl3)
df_full %>% filter(covid_meds == 'Yes') %>% distinct(PID, covid_meds, excl8)
df_full %>% filter(steroids == 'Yes') %>% distinct(PID, steroids, excl7)
df_full %>% filter(pregnant_breastfeeding == 'Yes') %>% distinct(PID, pregnant_breastfeeding, excl6)

# Frequency of the (lower-cased) notes for covid medication and steroids.
# across() replaces the deprecated mutate_each(): every selected column is
# lower-cased and converted to a factor so that summary() tabulates it.
df_full %>%
  filter(covid_meds == 'Yes') %>%
  select(covid_meds, notes) %>%
  mutate(across(everything(), ~ factor(tolower(.x)))) %>%
  summary()
df_full %>%
  filter(steroids == 'Yes') %>%
  select(steroids, notes) %>%
  mutate(across(everything(), ~ factor(tolower(.x)))) %>%
  summary()

```

```{r automatic input enrolment and demographics, eval = eval_dsmb, echo = echo_dsmb, message = F}

##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Dataset: enrolment/demographics        #
##########################################

# Enrolment of participants: the number of participants enrolled is assumed to
# be equal to the number of unique PIDs

part_enrol <- length(unique(df_full$PID))

# Per arm. filter() drops records with a missing group, in line with the
# per-site counts below; logical subsetting (PID[group == Arm1]) turns every
# NA group into an NA PID, which unique() then counts as an extra participant.
part_group1 <- df_full %>% filter(group == Arm1) %>% distinct(PID) %>% nrow()
part_group2 <- df_full %>% filter(group == Arm2) %>% distinct(PID) %>% nrow()

# Per site

part_enrol_central <- df_full %>% filter(site == 'Central') %>% distinct(PID) %>% nrow()
part_enrol_eden <- df_full %>% filter(site == 'Eden') %>% distinct(PID) %>% nrow()
part_enrol_uct <- df_full %>% filter(site == 'UCT') %>% distinct(PID) %>% nrow()

# Per site and arm

part_enrol_central_group1 <- df_full %>% filter(site == 'Central', group == Arm1) %>% distinct(PID) %>% nrow()
part_enrol_eden_group1 <- df_full %>% filter(site == 'Eden', group == Arm1) %>% distinct(PID) %>% nrow()
part_enrol_uct_group1 <- df_full %>% filter(site == 'UCT', group == Arm1) %>% distinct(PID) %>% nrow()

part_enrol_central_group2 <- df_full %>% filter(site == 'Central', group == Arm2) %>% distinct(PID) %>% nrow()
part_enrol_eden_group2 <- df_full %>% filter(site == 'Eden', group == Arm2) %>% distinct(PID) %>% nrow()
part_enrol_uct_group2 <- df_full %>% filter(site == 'UCT', group == Arm2) %>% distinct(PID) %>% nrow()

# Follow-up meetings serology: number of participants with a record for visit
# week 10, 26 and 52, overall and per arm

# Number of distinct PIDs in `data` with visit_week_id_4 equal to week_id
count_participants_at_week <- function(data, week_id) {
  nrow(distinct(filter(data, visit_week_id_4 == !!week_id), PID))
}

df_full_group1 <- filter(df_full, group == Arm1)
df_full_group2 <- filter(df_full, group == Arm2)

part_followup_10 <- count_participants_at_week(df_full, 10)
part_followup_10_group1 <- count_participants_at_week(df_full_group1, 10)
part_followup_10_group2 <- count_participants_at_week(df_full_group2, 10)
part_followup_26 <- count_participants_at_week(df_full, 26)
part_followup_26_group1 <- count_participants_at_week(df_full_group1, 26)
part_followup_26_group2 <- count_participants_at_week(df_full_group2, 26)
part_followup_52 <- count_participants_at_week(df_full, 52)
part_followup_52_group1 <- count_participants_at_week(df_full_group1, 52)
part_followup_52_group2 <- count_participants_at_week(df_full_group2, 52)


# Follow-up (number of visits): distinct visit week / visit date combinations
# per participant

follow_up_visits <- df_full %>%
  filter(visit_week_id_4 != '') %>%
  distinct(PID, visit_week_id_4, visit_date_4) %>%
  group_by(PID) %>%
  summarize(number_of_visits = n())

# part_followup_visit_1 <- df_full %>% filter(visit_week_id_4 == 1) %>% distinct(PID) %>% nrow() #actual number with visit week ID 1

# Number of participants with at least min_visits follow-up visits
count_with_min_visits <- function(min_visits) {
  nrow(filter(follow_up_visits, number_of_visits >= !!min_visits))
}

part_followup_visit_1 <- count_with_min_visits(1)
part_followup_visit_2 <- count_with_min_visits(2)
part_followup_visit_3 <- count_with_min_visits(3)
part_followup_visit_4 <- count_with_min_visits(4)
part_followup_visit_5 <- count_with_min_visits(5)
part_followup_visit_6 <- count_with_min_visits(6)
part_followup_visit_7 <- count_with_min_visits(7)
part_followup_visit_8 <- count_with_min_visits(8)
part_followup_visit_9 <- count_with_min_visits(9)
part_followup_visit_10 <- count_with_min_visits(10)
part_followup_visit_11 <- count_with_min_visits(11)
part_followup_visit_12 <- count_with_min_visits(12)
part_followup_visit_13 <- count_with_min_visits(13)
part_followup_visit_14 <- count_with_min_visits(14)
part_followup_visit_15 <- count_with_min_visits(15)
part_followup_visit_16 <- count_with_min_visits(16)
part_followup_visit_17 <- count_with_min_visits(17)
part_followup_visit_18 <- count_with_min_visits(18)
part_followup_visit_19 <- count_with_min_visits(19)
part_followup_visit_20 <- count_with_min_visits(20)


# Follow-up (monthly): participants with a visit in each 4-week window
# (upper bound of the window included, lower bound excluded)

followup_4 <- distinct(filter(df_full, visit_week_id_4 <= 4), PID)
followup_8 <- distinct(filter(df_full, visit_week_id_4 > 4, visit_week_id_4 <= 8), PID)
followup_12 <- distinct(filter(df_full, visit_week_id_4 > 8, visit_week_id_4 <= 12), PID)
followup_16 <- distinct(filter(df_full, visit_week_id_4 > 12, visit_week_id_4 <= 16), PID)
followup_20 <- distinct(filter(df_full, visit_week_id_4 > 16, visit_week_id_4 <= 20), PID)
followup_24 <- distinct(filter(df_full, visit_week_id_4 > 20, visit_week_id_4 <= 24), PID)

# Number of participants per window
part_followup_4 <- nrow(followup_4)
part_followup_8 <- nrow(followup_8)
part_followup_12 <- nrow(followup_12)
part_followup_16 <- nrow(followup_16)
part_followup_20 <- nrow(followup_20)
part_followup_24 <- nrow(followup_24)

# Demographics

# One record per participant: select unique PIDs to prevent double counting of
# IDs with multiple records (of for example events). Keeps only the first record!
df_first_record <- distinct(df_full, PID, .keep_all = TRUE)

df_demographics_numerical <- df_first_record %>%
  select(weight, height, BMI, age, group, site)

df_demographics_binary <- df_first_record %>%
  select(gender, HIVrapid_result, smoking, group, site) %>%
  mutate(
    gender = factor(gender),
    HIVrapid_result = factor(HIVrapid_result),
    smoking = factor(smoking)
  )

# For ethnicity and country of birth, the category "Other" is replaced by the
# text of the accompanying comment field
df_demographics_categorical <- df_first_record %>%
  select(country_birth, country_birth.other_comment, ethnicity, ethnicity.other_comment, education, group, site) %>%
  mutate(
    ethnicity = factor(ifelse(ethnicity != "Other", as.character(ethnicity), as.character(ethnicity.other_comment))),
    country_birth = factor(ifelse(country_birth != "Other", as.character(country_birth), as.character(country_birth.other_comment))),
    education = factor(education)
  )

# Work related variables; job titles are lower-cased before conversion to factor
df_work_categorical <- df_first_record %>%
  select(job_category, job_title, unit, work_hours, expect_interact, group, site) %>%
  mutate(
    job_category = factor(job_category),
    job_title = factor(tolower(job_title)),
    unit = factor(unit),
    work_hours = factor(work_hours),
    expect_interact = factor(expect_interact)
  )

# Risks

df_risk <- df_first_record %>%
  select(
    PID, age, gender, BMI, height, weight, ethnicity, job_category,
    medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd,
    medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years,
    group, site, expect_interact, smoking
  )

# Summarize continuous risks: median and quartiles (25% / 75%), overall and
# per arm, ignoring missing values

# age
risk_age_group1 <- df_risk$age[df_risk$group == Arm1]
risk_age_group2 <- df_risk$age[df_risk$group == Arm2]

sum_age_med <- median(df_risk$age, na.rm = TRUE)
sum_age_lwr <- quantile(df_risk$age, 0.25, na.rm = TRUE)
sum_age_upr <- quantile(df_risk$age, 0.75, na.rm = TRUE)

sum_age_med_group1 <- median(risk_age_group1, na.rm = TRUE)
sum_age_lwr_group1 <- quantile(risk_age_group1, 0.25, na.rm = TRUE)
sum_age_upr_group1 <- quantile(risk_age_group1, 0.75, na.rm = TRUE)

sum_age_med_group2 <- median(risk_age_group2, na.rm = TRUE)
sum_age_lwr_group2 <- quantile(risk_age_group2, 0.25, na.rm = TRUE)
sum_age_upr_group2 <- quantile(risk_age_group2, 0.75, na.rm = TRUE)

# BMI
risk_BMI_group1 <- df_risk$BMI[df_risk$group == Arm1]
risk_BMI_group2 <- df_risk$BMI[df_risk$group == Arm2]

sum_BMI_med <- median(df_risk$BMI, na.rm = TRUE)
sum_BMI_lwr <- quantile(df_risk$BMI, 0.25, na.rm = TRUE)
sum_BMI_upr <- quantile(df_risk$BMI, 0.75, na.rm = TRUE)

sum_BMI_med_group1 <- median(risk_BMI_group1, na.rm = TRUE)
sum_BMI_lwr_group1 <- quantile(risk_BMI_group1, 0.25, na.rm = TRUE)
sum_BMI_upr_group1 <- quantile(risk_BMI_group1, 0.75, na.rm = TRUE)

sum_BMI_med_group2 <- median(risk_BMI_group2, na.rm = TRUE)
sum_BMI_lwr_group2 <- quantile(risk_BMI_group2, 0.25, na.rm = TRUE)
sum_BMI_upr_group2 <- quantile(risk_BMI_group2, 0.75, na.rm = TRUE)

# pack years: quartiles among records with a known, non-zero number of pack years
risk_has_pack_years <- !is.na(df_risk$pack_years) & df_risk$pack_years != 0
risk_pack_years <- df_risk$pack_years[risk_has_pack_years]
risk_pack_years_group1 <- df_risk$pack_years[risk_has_pack_years & df_risk$group == Arm1]
risk_pack_years_group2 <- df_risk$pack_years[risk_has_pack_years & df_risk$group == Arm2]

sum_smoke_lwr <- quantile(risk_pack_years, 0.25, na.rm = TRUE)
sum_smoke_upr <- quantile(risk_pack_years, 0.75, na.rm = TRUE)
sum_smoke_lwr_group1 <- quantile(risk_pack_years_group1, 0.25, na.rm = TRUE)
sum_smoke_upr_group1 <- quantile(risk_pack_years_group1, 0.75, na.rm = TRUE)
sum_smoke_lwr_group2 <- quantile(risk_pack_years_group2, 0.25, na.rm = TRUE)
sum_smoke_upr_group2 <- quantile(risk_pack_years_group2, 0.75, na.rm = TRUE)
#summarize categorical

# if(CLOSED == T){
#   placeholder = quo(group) #create placeholder for closed report, to stratify on group
# } else {
#   placeholder = quo(trunc(group-1.5)) #use trunc function to round both -0.5 and 0.5 to zero (1-1.5 = -0.5, 2-1.5 = 0.5)--> both arms become 0
# } 

# Number of participants per level of each categorical characteristic and per arm.
# The level column is renamed to `category` in every table so that the three
# tables share the same columns and can be stacked.
df_risk_gender <- df_risk %>%
  group_by(gender, group) %>%
  summarise(count = n()) %>%
  rename(category = gender)

df_risk_ethnicity <- df_risk %>%
  group_by(ethnicity, group) %>%
  summarise(count = n()) %>%
  rename(category = ethnicity)

df_risk_job_category <- df_risk %>%
  group_by(job_category, group) %>%
  summarise(count = n()) %>%
  rename(category = job_category)

# Stack gender, ethnicity and job category counts into one long table
df_risk_categorical_sum <- rbind(
  df_risk_gender,
  df_risk_ethnicity,
  df_risk_job_category
)

# Fix the order of the categories: gender first, then ethnicity, then job category
df_risk_categorical_sum$category <- factor(
  df_risk_categorical_sum$category,
  levels = c(
    'Male', 'Female',
    'African', 'Caucasian', 'Coloured', 'Indian', 'Other',
    'Doctor', 'Nurse', 'Essential_workers', 'Support_staff', 'Frontline_workers'
  )
)

#summarize medical risk factors
# Long format: one row per participant and medical risk factor
# (all columns from medhis_dm up to and including bcg_scar).
# value2 recodes the answer numerically: 'Yes' -> 1, 'No' -> 0, anything else -> NA.
df_med_risk <- df_risk %>%
  pivot_longer(
    cols = medhis_dm:bcg_scar,
    names_to = 'risk',
    values_to = 'value'
  ) %>%
  mutate(
    value2 = ifelse(value == 'Yes', 1,
                    ifelse(value == 'No', 0, NA))
  )

# Number of 'Yes' answers per risk factor and arm (missing answers are ignored)
df_med_risk_sum <- df_med_risk %>%
  group_by(risk, group) %>%
  summarise(prevalence = sum(value2, na.rm = TRUE))

```


```{r automatic input quality, eval = eval_dsmb, echo = echo_dsmb, message = F}

##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: data management and quality     #
##########################################

#Quality management of the data

# Total number of QC records with any non-empty answer in qc_correct,
# summed over the four QC eCRF tables
n_qc <- nrow(filter(df_qc_eCRF, qc_correct != '')) +
  nrow(filter(df2_qc_eCRF, qc_correct != '')) +
  nrow(filter(df3_qc_eCRF, qc_correct != '')) +
  nrow(filter(df4_qc_eCRF, qc_correct != ''))

# correction needed: yes
n_qc_yes <- nrow(filter(df_qc_eCRF, qc_correct == 'Yes')) +
  nrow(filter(df2_qc_eCRF, qc_correct == 'Yes')) +
  nrow(filter(df3_qc_eCRF, qc_correct == 'Yes')) +
  nrow(filter(df4_qc_eCRF, qc_correct == 'Yes'))

# correction needed: no
n_qc_no <- nrow(filter(df_qc_eCRF, qc_correct == 'No')) +
  nrow(filter(df2_qc_eCRF, qc_correct == 'No')) +
  nrow(filter(df3_qc_eCRF, qc_correct == 'No')) +
  nrow(filter(df4_qc_eCRF, qc_correct == 'No'))

# Unique QC dates, sorted, taken from every record that has at least one
# non-empty qc_correct_* answer
qc_date0 <- df_full %>%
  filter(qc_correct_1 != '' | qc_correct_2 != '' | qc_correct_3 != '' | qc_correct_4 != '') %>%
  pivot_longer(
    cols = c(qc_date_1, qc_date_2, qc_date_3, qc_date_4),
    names_to = 'date_name',
    values_to = 'date'
  ) %>%
  transmute(date = as.character(date)) %>% # a factor cannot be arranged chronologically
  na.omit() %>%
  unique() %>%
  arrange(date)

# Same dates formatted for the report text (weekday, day month year)
qc_date <- format(as.Date(qc_date0$date), '%A, %d %B %Y')

# Date of last data review: the latest QC date found above (missing dates ignored)
date_last_review <- max(as.Date(qc_date0$date), na.rm = TRUE)

```

```{r automatic input vaccinations, eval = eval_dsmb, echo = echo_dsmb, message = F}

##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: other vaccinations than BCG     #
##########################################

# COVID19 vaccination

# Number of unique participants with a recorded COVID-19 vaccination
# (vaccine == 'Yes' and a non-empty vaccine name): overall ...
part_vaccine_C19 <- nrow(distinct(
  filter(df_full, vaccine == 'Yes', Covid_VAC_name != ''),
  PID
))

# ... in the first arm ...
part_vaccine_C19_group1 <- nrow(distinct(
  filter(df_full, group == Arm1, vaccine == 'Yes', Covid_VAC_name != ''),
  PID
))

# ... and in the second arm
part_vaccine_C19_group2 <- nrow(distinct(
  filter(df_full, group == Arm2, vaccine == 'Yes', Covid_VAC_name != ''),
  PID
))
 
#check for adverse events after vaccination date

# Adverse events that started after the participant's COVID-19 vaccination date.
# One row per distinct event record, with the delay (time_def) between the
# COVID-19 vaccination and the event start.
ae_vaccine <- df_full %>%
  # Keep records WITH a COVID-19 vaccine name. The previous condition
  # (Covid_VAC_name == '') selected records without a COVID-19 vaccine, which is
  # the pattern used elsewhere in this file for flu vaccination and contradicts
  # the use of Date_Covid_VAC below and the selection used for part_vaccine_C19.
  filter(vaccine == 'Yes',
         Covid_VAC_name != '') %>%
  # NOTE(review): the dates are compared as stored; this is only chronological
  # if both columns hold ISO-formatted (yyyy-mm-dd) values - confirm.
  filter(event_start_all > Date_Covid_VAC) %>%
  distinct(PID, group, event_number, Covid_VAC_name, event_name_all, LLT, event_HS, Date_Covid_VAC, event_start_all, date_vaccination) %>%
  mutate(time_def = as.Date(event_start_all) - as.Date(Date_Covid_VAC)) %>%
  select(PID, group, event_number, Covid_VAC_name, Date_Covid_VAC, event_start_all, time_def, event_HS, LLT, event_name_all)

# Number of distinct events (participant x event number) in ae_vaccine:
# overall and per arm
n_ae_vaccine <- nrow(distinct(ae_vaccine, PID, event_number, group))

n_ae_vaccine_group1 <- nrow(distinct(
  filter(ae_vaccine, group == Arm1),
  PID, event_number, group
))
n_ae_vaccine_group2 <- nrow(distinct(
  filter(ae_vaccine, group == Arm2),
  PID, event_number, group
))

# Number of distinct participants with at least one event in ae_vaccine:
# overall and per arm
part_ae_vaccine <- nrow(distinct(ae_vaccine, PID))

part_ae_vaccine_group1 <- nrow(distinct(filter(ae_vaccine, group == Arm1), PID))
part_ae_vaccine_group2 <- nrow(distinct(filter(ae_vaccine, group == Arm2), PID))

```

```{r automatic input final date, eval = eval_dsmb, echo = echo_dsmb, message = F}

##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: final date in trial             #
##########################################


#get final date (after the final data transfer)

# Final date per participant:
#  - latest visit_date_4 among records whose visit_week_id_4 is 52, -99, 50 or 53
#  - plus, for fatalities (event_HS == 7), the latest event stop date (date of death)
df_final <- df_full %>%
  filter(visit_week_id_4 %in% c(52, -99, 50, 53)) %>%
  transmute(PID, final_date = as.Date(visit_date_4)) %>%
  # sort descending so that distinct() keeps the latest date per participant
  arrange(desc(final_date)) %>%
  distinct(PID, .keep_all = TRUE) %>%
  # for fatalities, the last event stop is the final date (date of death)
  rbind(
    df_full %>%
      filter(event_HS == 7) %>%
      distinct(PID, final_date = event_stop_all) %>%
      arrange(PID, desc(final_date)) %>%
      distinct(PID, .keep_all = TRUE)
  )

# Attach the final date to every record of the participant
df_full <- left_join(df_full, df_final, by = 'PID')

#check if the final date is the same as the withdraw date
# df_full %>% filter(!is.na(withdraw_date)) %>% distinct(withdraw_date, final_date)

#cumulative enrolment
# Cumulative participant-weeks per arm, relative to the total planned
# follow-up (52 weeks x part_enrol participants).
# Weeks are counted from each vaccination date up to the day the report is compiled.
df_max_cum_partweek <- df_full %>%
  distinct(PID, .keep_all = TRUE) %>% # one record per participant
  group_by(date_vaccination, group) %>% # enrolment per vaccination date and arm
  summarise(enrolled = n()) %>%
  mutate(week = as.numeric(difftime(Sys.Date(), as.Date(date_vaccination), units = 'weeks'))) %>%
  group_by(group) %>% # accumulate within arm
  mutate(
    partweek = enrolled * week,
    cum_partweek = cumsum(partweek) / (52 * part_enrol) # relative to total study
  ) %>%
  summarise(max_cum_partweek = max(cum_partweek))

```

```{r automatic input censoring, eval = eval_dsmb, echo = echo_dsmb, message = F}

##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: right-censoring for vaccination #
# (1), withdrawal (2), death (3), flu (4)#
##########################################

# Create censoring datasets for downstream Kaplan Meier or survival analysis (date of censoring, status = 0 [because no event], censor category)

# The censoring datasets are assembled from four components that all share the
# columns PID, censor_date, date_vaccination and censor (the vaccination
# component additionally carries Covid_VAC_name). Each component is built once
# and reused, instead of repeating the same pipeline for every dataset.

# (1) censoring because of COVID-19 vaccination: first vaccination date per participant
censor_covid_vaccination <- df_full %>%
  filter(vaccine == 'Yes', Date_Covid_VAC != '') %>%
  arrange(Date_Covid_VAC) %>% # earliest date first, so distinct() keeps the first vaccination
  select(PID, Date_Covid_VAC, Covid_VAC_name, date_vaccination) %>%
  distinct(PID, .keep_all = TRUE) %>%
  mutate(Date_Covid_VAC = as.Date(Date_Covid_VAC)) %>%
  rename(censor_date = Date_Covid_VAC) %>%
  mutate(censor = 1)

# (2) censoring because of withdrawal
censor_withdrawal <- df_full %>%
  filter(withdraw_date != '') %>%
  select(PID, withdraw_date, date_vaccination) %>%
  distinct(PID, withdraw_date, .keep_all = TRUE) %>%
  mutate(withdraw_date = as.Date(withdraw_date)) %>%
  rename(censor_date = withdraw_date) %>%
  mutate(censor = 2)

# (3) censoring because of death (event_HS == 7); the final date is the censoring date
censor_death <- df_full %>%
  filter(event_HS == 7) %>%
  select(PID, final_date, date_vaccination) %>%
  distinct(PID, final_date, .keep_all = TRUE) %>%
  rename(censor_date = final_date) %>%
  mutate(censor = 3)

# (4) censoring because of flu vaccination (vaccine == 'Yes' without a COVID-19 vaccine name)
censor_flu_vaccination <- df_full %>%
  filter(vaccine == 'Yes', Covid_VAC_name == '') %>%
  distinct(PID, vaccine, Covid_VAC_name, flu_vac_date, date_vaccination) %>%
  # assumed date of vaccination based on received column of the questionnaire
  separate(flu_vac_date, into = c('Date', 'Time'), sep = ' ') %>%
  mutate(flu_vac_date1 = as.Date(Date, format = '%d-%m-%Y')) %>%
  select(PID, flu_vac_date1, date_vaccination) %>%
  distinct(PID, flu_vac_date1, .keep_all = TRUE) %>%
  rename(censor_date = flu_vac_date1) %>%
  mutate(censor = 4)

# Shared final step: sort per participant, compute time from BCG/placebo
# vaccination to censoring in days, and set status = 0 (censored, no event).
add_censor_time <- function(censor_df) {
  censor_df %>%
    arrange(PID, censor_date) %>%
    mutate(time = as.numeric(as.Date(censor_date, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), units = 'days')) %>%
    mutate(status = 0)
}

# The joins below use all shared columns as keys (no `by`), exactly as before;
# because `censor` differs between components, they effectively stack the rows.

# Censoring for vaccination (1), withdrawal (2) and death (3)
df_PID_censor <- censor_covid_vaccination %>%
  full_join(censor_withdrawal) %>%
  full_join(censor_death) %>%
  add_censor_time()

# As above, additionally censoring for flu vaccination (4)
df_PID_censor_withflu <- censor_covid_vaccination %>%
  full_join(censor_withdrawal) %>%
  full_join(censor_death) %>%
  full_join(censor_flu_vaccination) %>%
  add_censor_time()

# Intention-to-treat: censor at withdrawal (2) or death (3) only.
# Censoring for vaccinations (codes 1 and 4) belongs to the per-protocol dataset.
df_PID_censor_withflu_ITT <- filter(df_PID_censor_withflu, censor %in% c(2, 3))
  

```

```{r automatic input efficacy, eval = eval_dsmb, echo = echo_dsmb, message = F}

##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: efficacy (HS over time)         #
##########################################

#input efficacy

#original event form

# Health status (HS) over time from the original event form.
# Result: one row per event and reported week, with
#   Time     = week since vaccination,
#   TimeDate = calendar date of that event week,
#   HS       = numeric health status (rows with missing HS are removed).
hs_time_event0_original <- df_full %>%
  filter(!is.na(event_start)) %>% #only select RTI events which have a non-NA event_start
  select(PID, group, event_number, Onset:Week_5, event_start_all, date_vaccination, final_date, event_status, event_ongoing, event_stop) %>%
  arrange(PID, event_start_all) %>% #to make sure the first start date of an event is first
  # week of event onset, counted from the vaccination date and rounded to whole weeks
  mutate(onset_week = round(as.numeric(as.Date(event_start_all, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'weeks'))) %>%
  filter(!is.na(Onset)) %>%
  filter(Onset != 'N_A') %>% #coding in case of SARS-CoV-2 antibody test positive and Asymptomatic COVID-19  
  #Onset and Week_1 are for the first week (onset = day 1, Week_1 = day 2-7)
  #they are merged row by row into one value: the larger of the two scores (missing values ignored)
  rowwise() %>% 
  mutate(OnsetWeek1 = as.character(max(as.numeric(as.character(Onset)), as.numeric(as.character(Week_1)), na.rm = T))) %>% 
  ungroup() %>% 
  select(PID, group, event_number, OnsetWeek1, Week_2:Week_5, event_start_all, date_vaccination, final_date, event_status, event_ongoing, event_stop, onset_week) %>% 
  # long format: column names are split on '_' into Time0 (prefix) and Time00 (week number);
  # 'OnsetWeek1' contains no '_' so it has no week-number part
  pivot_longer(cols = OnsetWeek1:Week_5, names_to = c('Time0', 'Time00'), names_sep = '_', values_to = 'HS') %>% 
  mutate(Time = ifelse(Time0 == 'OnsetWeek1', onset_week, onset_week + as.numeric(as.character(Time00)) -1)) %>% #get time in weeks since vaccination minus 1 (because onset and week1 are the same week)
  mutate(HS = as.numeric(as.character(HS)),
         Time0 = tolower(Time0), #for merging original and fu (which have a capital W difference in week)
         TimeDate = if_else(Time0 == 'onsetweek1',
                           as.Date(event_start_all),
                           as.Date(event_start_all) + 7 * (as.numeric(as.character(Time00))-1))) %>%  #to get the actual date for the event week
  filter(!is.na(HS)) %>% #remove NAs for HS
  distinct()


#transform to integer
# Weekly health-status columns of the follow-up event form
column_to_integer <- c(
  "week_1", "week_2", "week_3", "week_4", "week_5", "week_6",
  "week_7", "week_8", "week_9", "week_10", "week_11", "week_12"
)

# Convert these columns to numeric, going through character first so that
# factor columns yield their labels rather than their internal codes
df_full <- df_full %>%
  mutate_at(column_to_integer, function(x) as.numeric(as.character(x)))

#follow-up event form
# Health status (HS) over time from the follow-up event form (columns week_1 ... week_12).
# Produces the same columns as hs_time_event0_original so that both can be stacked.
# Rows with missing HS are NOT removed here.
hs_time_event0_fu <- df_full %>%
  filter(!is.na(event_start)) %>% #only select RTI events which have a non-NA event_start
  select(PID, group, event_number, week_1:week_12, event_start_all, date_vaccination, final_date, event_status, event_ongoing, event_stop) %>%
  arrange(PID, event_start_all) %>% #to make sure the first start date of an event is first
  # week of event onset, counted from the vaccination date and rounded to whole weeks
  mutate(onset_week = round(as.numeric(as.Date(event_start_all, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'weeks'))) %>% 
  filter(!is.na(week_1)) %>%
  # long format: 'week_<n>' is split on '_' into Time0 = 'week' and Time00 = '<n>'
  pivot_longer(cols = week_1:week_12, names_to = c('Time0', 'Time00'), names_sep = '_', values_to = 'HS') %>% 
  # Time0 is always 'week' here, so the 'Onset' branch below never applies and
  # Time is always onset_week + week number - 1
  mutate(Time = ifelse(Time0 == 'Onset', onset_week, onset_week + as.numeric(as.character(Time00)) - 1)) %>% #get time in weeks since vaccination minus 1 (because onset and week1 are the same week)
  mutate(HS = as.numeric(as.character(HS)),
         Time0 = tolower(Time0), #for merging original and fu (which have a capital W difference in week)
         # likewise 'onsetweek1' never matches here: TimeDate is always event start + 7 * (week number - 1)
         TimeDate = if_else(Time0 == 'onsetweek1',
                           as.Date(event_start_all),
                           as.Date(event_start_all) + 7 * (as.numeric(as.character(Time00))-1)))  #to get the actual date for the event week

# Combine the original and follow-up event forms into one HS-over-time table.
# distinct() on the listed columns removes records present in both forms.
# Rows without a health status are dropped. The former second filter, which
# removed rows with missing HS after the final date, could never match once
# missing HS values are gone, so it has been removed.
hs_time_event0 <- rbind(hs_time_event0_original, hs_time_event0_fu) %>%
  distinct(PID, group, event_number, event_start_all, date_vaccination, final_date, Time, HS, event_status, event_ongoing, event_stop, TimeDate) %>% #remove overlapping records from original and follow-up
  filter(!is.na(HS)) #remove NAs for HS

# Participants with at least one event record, sorted by PID
part_hs_time_event <- hs_time_event0 %>%
  distinct(PID) %>%
  arrange(PID)

#create dataset with 0 values from start to end per event number, then anti-join with the actual event data 

# One template row per participant-event: the record with the earliest event
# start is kept when an event number has several records. PID_eventnumber is a
# combined key used for sorting.
hs_time_event_pre <- hs_time_event0 %>%
  arrange(PID, event_number, event_start_all) %>%
  distinct(PID, event_number, .keep_all = TRUE) %>%
  mutate(PID_eventnumber = paste(PID, event_number, sep = '_')) %>%
  arrange(PID_eventnumber)

# For every participant-event in hs_time_event_pre, create one row per week with
# HS = 0, from week 0 up to the (rounded) week of the final date, or of the
# cut-off date when no final date is available.
#
# The rows are built per event with lapply() and bound once at the end, instead
# of growing and re-sorting a data frame inside a for-loop (quadratic in the
# number of events). seq_len() also makes an empty hs_time_event_pre safe, where
# 1:length(...) would have iterated over c(1, 0).
zero_rows_per_event <- lapply(seq_len(nrow(hs_time_event_pre)), function(i) {
  # final date of this participant, falling back to the cut-off date
  last_date <- as.Date(ifelse(is.na(hs_time_event_pre$final_date[i]), date_cut_off, hs_time_event_pre$final_date[i]), origin = "1970-01-01")
  # week sequence from vaccination (week 0) to the final or cut-off date
  week_seq <- seq(0, round(as.numeric(last_date - as.Date(hs_time_event_pre$date_vaccination[i]), units = 'weeks')))

  hs_time_event_pre[rep(i, length(week_seq)), ] %>% # repeat the template row once per week
    mutate(HS = 0, # health status 0
           Time = week_seq) # assign the weekly sequence
})

hs_time_event_pre_zero <- if (length(zero_rows_per_event) > 0) {
  # rev() reproduces the row order of the former loop, which put each new event on top
  bind_rows(rev(zero_rows_per_event))
} else {
  hs_time_event_pre[0, ] # no events: empty table with the same columns
}

hs_time_event_pre_zero <- hs_time_event_pre_zero %>%
  arrange(PID_eventnumber) %>%
  select(!PID_eventnumber)
# Keep only the HS = 0 filler weeks for which no health status was actually
# recorded (same participant, arm, event number and week), and give them a
# calendar date counted in whole weeks from the vaccination date
hs_time_event_per_zero_antijoin <- hs_time_event_pre_zero %>%
  anti_join(hs_time_event0, by = c('PID', 'group', 'event_number', 'Time')) %>%
  mutate(TimeDate = as.Date(date_vaccination) + Time * 7)

# Recorded health status plus the HS = 0 filler weeks, per participant-event and week.
#
# A row is flagged for exclusion (exclude = 1) only when ALL of the following hold,
# because the three ifelse() calls are nested:
#   1. TimeDate is after the participant's final date, and
#   2. TimeDate is after the event's stop date, and
#   3. HS is 0 and the previous row of the same participant also has HS 0.
# NOTE(review): when a tested condition is NA (e.g. missing final_date or
# event_stop, or the first row of a participant in condition 3), exclude becomes
# NA and filter(exclude != 1) drops that row as well - confirm this is intended.
hs_time_event <- rbind(hs_time_event0, hs_time_event_per_zero_antijoin) %>%
  arrange(PID, event_number, Time) %>% 
  ungroup() %>% 
  group_by(PID) %>%
  mutate(exclude = ifelse(TimeDate > final_date, #condition 1: TimeDate after the final date
                          ifelse(TimeDate > event_stop,  #condition 2: TimeDate after the event's stop date
                                 ifelse(HS == 0 & lag(HS == 0),  #condition 3: HS=0 while the previous HS=0
                                        1, 0), 0), 0)) %>% 
  ungroup() %>% 
  filter(exclude != 1 ) %>% 
  select(-exclude) 

#for the participants without an event, create a dataset with HS=0 from vaccination (week = 0) to the cut-off date or the final date, whichever comes first

# One template row per participant who has no record in part_hs_time_event.
# Event-related columns are blanked, HS is set to 0, and Time is filled in later
# when the row is expanded to one row per week.
hs_time_noevent0 <- df_full %>%
  filter(!(PID %in% part_hs_time_event$PID)) %>%
  select(PID, group, event_number, event_start_all, date_vaccination, final_date) %>%
  distinct(PID, .keep_all = TRUE) %>%
  mutate(
    event_number = NA, # remove data from df_full for the no RTI events
    event_start_all = NA,
    event_status = NA,
    event_ongoing = NA,
    event_stop = NA,
    HS = 0,
    Time = NA
  )

# For every participant without an event, create one HS = 0 row per week from
# vaccination (week 0) up to the cut-off date or the final date, whichever
# comes first.
#
# The rows are built per participant with lapply() and bound once, instead of
# growing a data frame inside a for-loop. seq_len() also makes an empty
# hs_time_noevent0 safe, where 1:length(...) would have iterated over c(1, 0).
noevent_rows_per_participant <- lapply(seq_len(nrow(hs_time_noevent0)), function(i) {
  # weekly resolution from origin to cut-off date or final date, whichever comes first
  week_seq <- 0 : as.numeric(as.Date(min(date_cut_off, hs_time_noevent0$final_date[i], na.rm = TRUE), format = '%Y-%m-%d') - as.Date(hs_time_noevent0$date_vaccination[i], format = '%Y-%m-%d'), units = 'weeks')

  hs_time_noevent0[rep(i, length(week_seq)), ] %>% # repeat the template row once per week
    mutate(Time = week_seq) # assign the weekly sequence
})

hs_time_noevent_zero <- if (length(noevent_rows_per_participant) > 0) {
  # rev() reproduces the row order of the former loop, which put each new participant on top
  bind_rows(rev(noevent_rows_per_participant))
} else {
  hs_time_noevent0[0, ] # nobody without an event: empty table with the same columns
}

# Calendar date of each week, counted in whole weeks from the vaccination date
hs_time_noevent_zero <- hs_time_noevent_zero %>%
    mutate(TimeDate = as.Date(date_vaccination) + Time * 7)


# Health status over time for all participants: those with events (hs_time_event)
# and those without (hs_time_noevent_zero), sorted per participant, event and week.
#
# Two clean-up steps are applied within each participant:
#  a) rows with HS other than 7 that directly follow a row with HS = 7 are removed,
#     except rows at Time 0;
#  b) the same three-way exclusion as used for hs_time_event: a row is removed only
#     when TimeDate is after the final date AND after the event's stop date AND
#     both this row and the previous row have HS = 0 (the ifelse() calls are nested).
# NOTE(review): in both steps an NA condition (e.g. first row of a participant,
# missing final_date or event_stop) makes filter() drop the row - confirm intended.
hs_time <- rbind(hs_time_event, hs_time_noevent_zero) %>%
  arrange(PID, event_number, Time) %>% 
  #remove instances where a HS=7 is followed by a lower HS (not possible)
  group_by(PID) %>% 
  filter(!(HS != 7 & lag(HS) == 7 & Time != 0)) %>% 
  ungroup() %>% 
  group_by(PID) %>% 
  mutate(exclude = ifelse(TimeDate > final_date, #condition 1: TimeDate after the final date
                          ifelse(TimeDate > event_stop,  #condition 2: TimeDate after the event's stop date
                                 ifelse(HS == 0 & lag(HS == 0),  #condition 3: HS=0 while the previous HS=0
                                        1, 0), 0), 0)) %>% 
  ungroup() %>% 
  filter(exclude != 1 ) %>% 
  select(-exclude) 

#plot per HS

# Weekly health-status table: Time truncated to whole weeks, one row per
# participant, event number, health status and week. PID_eventno identifies a
# participant-event combination.
hs_time_week <- hs_time %>%
  filter(!is.na(HS)) %>%
  mutate(
    Time_week = trunc(Time),
    PID_eventno = paste(PID, event_number, sep = '_')
  ) %>%
  distinct(PID, event_number, HS, Time_week, .keep_all = TRUE) # remove duplicate rows

# Number of distinct participant-events per arm and week (denominator for the proportions)
hs_time_atrisk <- hs_time_week %>%
  group_by(group, Time_week) %>%
  summarise(at_risk = n_distinct(PID_eventno))

# Proportion of participant-events with health status 0, per arm and week:
# number of distinct participant-events with HS = 0 divided by the number at risk
hs_time_0 <- hs_time_week %>%
  filter(HS == 0) %>%
  group_by(group, Time_week) %>%
  summarise(event = n_distinct(PID_eventno)) %>%
  left_join(hs_time_atrisk) %>%
  mutate(proportion = event / at_risk, HS = 0)

# Same, restricted to the first arm
hs_time_0_group1 <- hs_time_week %>%
  filter(group == Arm1, HS == 0) %>%
  group_by(group, Time_week) %>%
  summarise(event = n_distinct(paste(PID, event_number))) %>%
  left_join(hs_time_atrisk) %>%
  mutate(proportion = event / at_risk, HS = 0)

# Same, restricted to the second arm
hs_time_0_group2 <- hs_time_week %>%
  filter(group == Arm2, HS == 0) %>%
  group_by(group, Time_week) %>%
  summarise(event = n_distinct(paste(PID, event_number))) %>%
  left_join(hs_time_atrisk) %>%
  mutate(proportion = event / at_risk, HS = 0)

hs_time_1 <- hs_time_week %>% 
  filter(HS == 1) %>%
  group_by(group, Time_week) %>% 
  summarise(event = length(unique(paste(PID, event_number)))) %>%
  left_join(hs_time_atrisk) %>%
  mutate(proportion = event/at_risk,
         HS = 1) %>%
  #get a HS=0 point 1 week before the first event for plotting and at t=0, keep HS for filtering/facetting
  rbind(hs_time_week %>% 
          filter(HS == 1) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 1)) %>% 
  rbind(hs_time_week %>% 
          filter(HS == 1) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 1))
hs_time_1_group1 <- hs_time_week %>% 
  filter(group == Arm1) %>%
  filter(HS == 1) %>%
  group_by(group, Time_week) %>% 
  summarise(event = length(unique(paste(PID, event_number)))) %>%
  left_join(hs_time_atrisk) %>%
  mutate(proportion = event/at_risk,
         HS = 1) %>%
  #get a HS=0 point 1 week before the first event for plotting and at t=0, keep HS for filtering/facetting
  rbind(hs_time_week %>% 
          filter(group == Arm1) %>%
          filter(HS == 1) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 1)) %>% 
  rbind(hs_time_week %>% 
          filter(group == Arm1) %>%
          filter(HS == 1) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 1))
hs_time_1_group2 <- hs_time_week %>% 
  filter(group == Arm2) %>%
  filter(HS == 1) %>%
  group_by(group, Time_week) %>% 
  summarise(event = length(unique(paste(PID, event_number)))) %>%
  left_join(hs_time_atrisk) %>%
  mutate(proportion = event/at_risk,
         HS = 1) %>%
  #get a HS=0 point 1 week before the first event for plotting and at t=0, keep HS for filtering/facetting
  rbind(hs_time_week %>% 
          filter(group == Arm2) %>%
          filter(HS == 1) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 1)) %>% 
  rbind(hs_time_week %>% 
          filter(group == Arm2) %>%
          filter(HS == 1) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 1))

hs_time_2 <- hs_time_week %>% 
  filter(HS == 2) %>%
  group_by(group, Time_week) %>% 
  summarise(event = length(unique(paste(PID, event_number)))) %>%
  left_join(hs_time_atrisk) %>%
  mutate(proportion = event/at_risk,
         HS = 2) %>%
  #get a HS=0 point 1 week before the first event for plotting and at t=0, keep HS for filtering/facetting
  rbind(hs_time_week %>% 
          filter(HS == 2) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 2)) %>% 
  rbind(hs_time_week %>% 
          filter(HS == 2) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 2))
hs_time_2_group1 <- hs_time_week %>% 
  filter(group == Arm1) %>%
  filter(HS == 2) %>%
  group_by(group, Time_week) %>% 
  summarise(event = length(unique(unique(paste(PID, event_number))))) %>%
  left_join(hs_time_atrisk) %>%
  mutate(proportion = event/at_risk,
         HS = 2) %>%
  #get a HS=0 point 1 week before the first event for plotting and at t=0, keep HS for filtering/facetting
  rbind(hs_time_week %>% 
          filter(group == Arm1) %>%
          filter(HS == 2) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 2)) %>% 
  rbind(hs_time_week %>% 
          filter(group == Arm1) %>%
          filter(HS == 2) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 2))
hs_time_2_group2 <- hs_time_week %>% 
  filter(group == Arm2) %>%
  filter(HS == 2) %>%
  group_by(group, Time_week) %>% 
  summarise(event = length(unique(paste(PID, event_number)))) %>%
  left_join(hs_time_atrisk) %>%
  mutate(proportion = event/at_risk,
         HS = 2) %>%
  #get a HS=0 point 1 week before the first event for plotting and at t=0, keep HS for filtering/facetting
  rbind(hs_time_week %>% 
          filter(group == Arm2) %>%
          filter(HS == 2) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 2)) %>% 
  rbind(hs_time_week %>% 
          filter(group == Arm2) %>%
          filter(HS == 2) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 2))

hs_time_3 <- hs_time_week %>% 
  filter(HS == 3) %>%
  group_by(group, Time_week) %>% 
  summarise(event = length(unique(paste(PID, event_number)))) %>%
  left_join(hs_time_atrisk) %>%
  mutate(proportion = event/at_risk,
         HS = 3) %>%
  #get a HS=0 point 1 week before the first event for plotting and at t=0, keep HS for filtering/facetting
  rbind(hs_time_week %>% 
          filter(HS == 3) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 3)) %>% 
  rbind(hs_time_week %>% 
          filter(HS == 3) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 3))
hs_time_3_group1 <- hs_time_week %>% 
  filter(group == Arm1) %>%
  filter(HS == 3) %>%
  group_by(group, Time_week) %>% 
  summarise(event = length(unique(paste(PID, event_number)))) %>%
  left_join(hs_time_atrisk) %>%
  mutate(proportion = event/at_risk,
         HS = 3) %>%
  #get a HS=0 point 1 week before the first event for plotting and at t=0, keep HS for filtering/facetting
  rbind(hs_time_week %>% 
          filter(group == Arm1) %>%
          filter(HS == 3) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 3)) %>% 
  rbind(hs_time_week %>% 
          filter(group == Arm1) %>%
          filter(HS == 3) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 3))
hs_time_3_group2 <- hs_time_week %>% 
  filter(group == Arm2) %>%
  filter(HS == 3) %>%
  group_by(group, Time_week) %>% 
  summarise(event = length(unique(paste(PID, event_number)))) %>%
  left_join(hs_time_atrisk) %>%
  mutate(proportion = event/at_risk,
         HS = 3) %>%
  #get a HS=0 point 1 week before the first event for plotting and at t=0, keep HS for filtering/facetting
  rbind(hs_time_week %>% 
          filter(group == Arm2) %>%
          filter(HS == 3) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 3)) %>% 
  rbind(hs_time_week %>% 
          filter(group == Arm2) %>%
          filter(HS == 3) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 3))

hs_time_4 <- hs_time_week %>% 
  filter(HS == 4) %>%
  group_by(group, Time_week) %>% 
  summarise(event = length(unique(paste(PID, event_number)))) %>%
  left_join(hs_time_atrisk) %>%
  mutate(proportion = event/at_risk,
         HS = 4) %>%
  #get a HS=0 point 1 week before the first event for plotting and at t=0, keep HS for filtering/facetting
  rbind(hs_time_week %>% 
          filter(HS == 4) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 4)) %>% 
  rbind(hs_time_week %>% 
          filter(HS == 4) %>%
          group_by(group, Time_week) %>% 
          summarise(event = length(unique(paste(PID, event_number)))) %>%
          left_join(hs_time_atrisk) %>%
          head(1) %>% 
          mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 4))
# Weekly events at HS = 4 for Arm1, divided by at_risk after joining
# hs_time_atrisk. The weekly summary is built once and reused for the two
# zero-proportion anchor rows (previously the same pipeline ran 3 times).
hs_time_4_group1 <- local({
  weekly <- hs_time_week %>%
    filter(group == Arm1) %>%
    filter(HS == 4) %>%
    group_by(group, Time_week) %>%
    summarise(event = length(unique(paste(PID, event_number)))) %>%
    left_join(hs_time_atrisk)
  # first row of the table: template for the two anchor rows
  first_week <- head(weekly, 1)

  weekly %>%
    mutate(proportion = event / at_risk,
           HS = 4) %>%
    # zero point one week before the first event; HS kept for filtering/facetting
    rbind(first_week %>%
            mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 4)) %>%
    # zero point at t = 0 (NA when the first event is already in week 0)
    rbind(first_week %>%
            mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 4))
})
# Weekly events at HS = 4 for Arm2, divided by at_risk after joining
# hs_time_atrisk. The weekly summary is built once and reused for the two
# zero-proportion anchor rows (previously the same pipeline ran 3 times).
hs_time_4_group2 <- local({
  weekly <- hs_time_week %>%
    filter(group == Arm2) %>%
    filter(HS == 4) %>%
    group_by(group, Time_week) %>%
    summarise(event = length(unique(paste(PID, event_number)))) %>%
    left_join(hs_time_atrisk)
  # first row of the table: template for the two anchor rows
  first_week <- head(weekly, 1)

  weekly %>%
    mutate(proportion = event / at_risk,
           HS = 4) %>%
    # zero point one week before the first event; HS kept for filtering/facetting
    rbind(first_week %>%
            mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 4)) %>%
    # zero point at t = 0 (NA when the first event is already in week 0)
    rbind(first_week %>%
            mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 4))
})

# Weekly events at HS = 5 per group (no arm filter), divided by at_risk after
# joining hs_time_atrisk. The weekly summary is built once and reused for the
# two zero-proportion anchor rows (previously the same pipeline ran 3 times).
hs_time_5 <- local({
  weekly <- hs_time_week %>%
    filter(HS == 5) %>%
    group_by(group, Time_week) %>%
    summarise(event = length(unique(paste(PID, event_number)))) %>%
    left_join(hs_time_atrisk)
  # head() is not group-aware: this is the first row of the whole table
  first_week <- head(weekly, 1)

  weekly %>%
    mutate(proportion = event / at_risk,
           HS = 5) %>%
    # zero point one week before the first event; HS kept for filtering/facetting
    rbind(first_week %>%
            mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 5)) %>%
    # zero point at t = 0 (NA when the first event is already in week 0)
    rbind(first_week %>%
            mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 5))
})
# Weekly events at HS = 5 for Arm1, divided by at_risk after joining
# hs_time_atrisk. The weekly summary is built once and reused for the two
# zero-proportion anchor rows (previously the same pipeline ran 3 times).
hs_time_5_group1 <- local({
  weekly <- hs_time_week %>%
    filter(group == Arm1) %>%
    filter(HS == 5) %>%
    group_by(group, Time_week) %>%
    summarise(event = length(unique(paste(PID, event_number)))) %>%
    left_join(hs_time_atrisk)
  # first row of the table: template for the two anchor rows
  first_week <- head(weekly, 1)

  weekly %>%
    mutate(proportion = event / at_risk,
           HS = 5) %>%
    # zero point one week before the first event; HS kept for filtering/facetting
    rbind(first_week %>%
            mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 5)) %>%
    # zero point at t = 0 (NA when the first event is already in week 0)
    rbind(first_week %>%
            mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 5))
})
# Weekly events at HS = 5 for Arm2, divided by at_risk after joining
# hs_time_atrisk. The weekly summary is built once and reused for the two
# zero-proportion anchor rows (previously the same pipeline ran 3 times).
hs_time_5_group2 <- local({
  weekly <- hs_time_week %>%
    filter(group == Arm2) %>%
    filter(HS == 5) %>%
    group_by(group, Time_week) %>%
    summarise(event = length(unique(paste(PID, event_number)))) %>%
    left_join(hs_time_atrisk)
  # first row of the table: template for the two anchor rows
  first_week <- head(weekly, 1)

  weekly %>%
    mutate(proportion = event / at_risk,
           HS = 5) %>%
    # zero point one week before the first event; HS kept for filtering/facetting
    rbind(first_week %>%
            mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 5)) %>%
    # zero point at t = 0 (NA when the first event is already in week 0)
    rbind(first_week %>%
            mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 5))
})

# Weekly events at HS = 6 per group (no arm filter), divided by at_risk after
# joining hs_time_atrisk. The weekly summary is built once and reused for the
# two zero-proportion anchor rows (previously the same pipeline ran 3 times).
hs_time_6 <- local({
  weekly <- hs_time_week %>%
    filter(HS == 6) %>%
    group_by(group, Time_week) %>%
    summarise(event = length(unique(paste(PID, event_number)))) %>%
    left_join(hs_time_atrisk)
  # head() is not group-aware: this is the first row of the whole table
  first_week <- head(weekly, 1)

  weekly %>%
    mutate(proportion = event / at_risk,
           HS = 6) %>%
    # zero point one week before the first event; HS kept for filtering/facetting
    rbind(first_week %>%
            mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 6)) %>%
    # zero point at t = 0 (NA when the first event is already in week 0)
    rbind(first_week %>%
            mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 6))
})
# Weekly events at HS = 6 for Arm1, divided by at_risk after joining
# hs_time_atrisk. The weekly summary is built once and reused for the two
# zero-proportion anchor rows (previously the same pipeline ran 3 times).
hs_time_6_group1 <- local({
  weekly <- hs_time_week %>%
    filter(group == Arm1) %>%
    filter(HS == 6) %>%
    group_by(group, Time_week) %>%
    summarise(event = length(unique(paste(PID, event_number)))) %>%
    left_join(hs_time_atrisk)
  # first row of the table: template for the two anchor rows
  first_week <- head(weekly, 1)

  weekly %>%
    mutate(proportion = event / at_risk,
           HS = 6) %>%
    # zero point one week before the first event; HS kept for filtering/facetting
    rbind(first_week %>%
            mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 6)) %>%
    # zero point at t = 0 (NA when the first event is already in week 0)
    rbind(first_week %>%
            mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 6))
})
# Weekly events at HS = 6 for Arm2, divided by at_risk after joining
# hs_time_atrisk. The weekly summary is built once and reused for the two
# zero-proportion anchor rows (previously the same pipeline ran 3 times).
hs_time_6_group2 <- local({
  weekly <- hs_time_week %>%
    filter(group == Arm2) %>%
    filter(HS == 6) %>%
    group_by(group, Time_week) %>%
    summarise(event = length(unique(paste(PID, event_number)))) %>%
    left_join(hs_time_atrisk)
  # first row of the table: template for the two anchor rows
  first_week <- head(weekly, 1)

  weekly %>%
    mutate(proportion = event / at_risk,
           HS = 6) %>%
    # zero point one week before the first event; HS kept for filtering/facetting
    rbind(first_week %>%
            mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 6)) %>%
    # zero point at t = 0 (NA when the first event is already in week 0)
    rbind(first_week %>%
            mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 6))
})

# Weekly events at HS = 7 per group (no arm filter), divided by at_risk after
# joining hs_time_atrisk. The weekly summary is built once and reused for the
# two zero-proportion anchor rows (previously the same pipeline ran 3 times).
hs_time_7 <- local({
  weekly <- hs_time_week %>%
    filter(HS == 7) %>%
    group_by(group, Time_week) %>%
    summarise(event = length(unique(paste(PID, event_number)))) %>%
    left_join(hs_time_atrisk)
  # head() is not group-aware: this is the first row of the whole table
  first_week <- head(weekly, 1)

  weekly %>%
    mutate(proportion = event / at_risk,
           HS = 7) %>%
    # zero point one week before the first event; HS kept for filtering/facetting
    rbind(first_week %>%
            mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 7)) %>%
    # zero point at t = 0 (NA when the first event is already in week 0)
    rbind(first_week %>%
            mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 7))
})
# Weekly events at HS = 7 for Arm1, divided by at_risk after joining
# hs_time_atrisk. The weekly summary is built once and reused for the two
# zero-proportion anchor rows (previously the same pipeline ran 3 times).
hs_time_7_group1 <- local({
  weekly <- hs_time_week %>%
    filter(group == Arm1) %>%
    filter(HS == 7) %>%
    group_by(group, Time_week) %>%
    summarise(event = length(unique(paste(PID, event_number)))) %>%
    left_join(hs_time_atrisk)
  # first row of the table: template for the two anchor rows
  first_week <- head(weekly, 1)

  weekly %>%
    mutate(proportion = event / at_risk,
           HS = 7) %>%
    # zero point one week before the first event; HS kept for filtering/facetting
    rbind(first_week %>%
            mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 7)) %>%
    # zero point at t = 0 (NA when the first event is already in week 0)
    rbind(first_week %>%
            mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 7))
})
# Weekly events at HS = 7 for Arm2, divided by at_risk after joining
# hs_time_atrisk. The weekly summary is built once and reused for the two
# zero-proportion anchor rows (previously the same pipeline ran 3 times).
hs_time_7_group2 <- local({
  weekly <- hs_time_week %>%
    filter(group == Arm2) %>%
    filter(HS == 7) %>%
    group_by(group, Time_week) %>%
    summarise(event = length(unique(paste(PID, event_number)))) %>%
    left_join(hs_time_atrisk)
  # first row of the table: template for the two anchor rows
  first_week <- head(weekly, 1)

  weekly %>%
    mutate(proportion = event / at_risk,
           HS = 7) %>%
    # zero point one week before the first event; HS kept for filtering/facetting
    rbind(first_week %>%
            mutate(Time_week = Time_week - 1, event = 0, at_risk = NA, proportion = 0, HS = 7)) %>%
    # zero point at t = 0 (NA when the first event is already in week 0)
    rbind(first_week %>%
            mutate(Time_week = ifelse(Time_week == 0, NA, 0), event = 0, at_risk = NA, proportion = 0, HS = 7))
})

# Stack the per-health-score tables (HS 0 to 7) into one table.
# Open report (CLOSED is FALSE): the tables built without an arm filter.
# Closed report: the per-arm tables (group1 and group2) for every health score.
# `!CLOSED` replaces `CLOSED == F`, which depended on the reassignable `F`.
if (!CLOSED) {
  hs_time_0to7 <- rbind(
    hs_time_0,
    hs_time_1,
    hs_time_2,
    hs_time_3,
    hs_time_4,
    hs_time_5,
    hs_time_6,
    hs_time_7
  )
} else {
  hs_time_0to7 <- rbind(
    hs_time_0_group1, hs_time_0_group2,
    hs_time_1_group1, hs_time_1_group2,
    hs_time_2_group1, hs_time_2_group2,
    hs_time_3_group1, hs_time_3_group2,
    hs_time_4_group1, hs_time_4_group2,
    hs_time_5_group1, hs_time_5_group2,
    hs_time_6_group1, hs_time_6_group2,
    hs_time_7_group1, hs_time_7_group2
  )
}

# Transitions between health scores: within each participant/event, compare
# every HS record with the preceding one and keep only rows where HS changed.
# NOTE(review): lag() follows the current row order, so this assumes hs_time is
# already ordered by time within PID / event_number -- confirm upstream.
hs_time_transition <- hs_time %>%
  filter(!is.na(HS)) %>%
  group_by(PID, event_number) %>%
  mutate(HS_prev = lag(HS)) %>%
  mutate(HS_diff = HS - HS_prev) %>%
  filter(
    # remove first records
    # (earlier variant: !(Time0 == 'Onset' & Time == 0))
    Time != 0,
    # PIDs without events
    !is.na(event_number),
    # remove rows without a transition
    HS_diff != 0
  )

# transitions starting from HS = 0
# (name suffix = <HS before><HS after>; HS_diff = HS after - HS before)
hs_time_transition_01 <- filter(hs_time_transition, HS == 1, HS_diff == 1)
hs_time_transition_02 <- filter(hs_time_transition, HS == 2, HS_diff == 2)
hs_time_transition_03 <- filter(hs_time_transition, HS == 3, HS_diff == 3)
hs_time_transition_04 <- filter(hs_time_transition, HS == 4, HS_diff == 4)
hs_time_transition_05 <- filter(hs_time_transition, HS == 5, HS_diff == 5)
hs_time_transition_06 <- filter(hs_time_transition, HS == 6, HS_diff == 6)
hs_time_transition_07 <- filter(hs_time_transition, HS == 7, HS_diff == 7)
# transitions starting from HS = 1
# (name suffix = <HS before><HS after>; HS_diff = HS after - HS before)
hs_time_transition_10 <- filter(hs_time_transition, HS == 0, HS_diff == -1)
hs_time_transition_12 <- filter(hs_time_transition, HS == 2, HS_diff == 1)
hs_time_transition_13 <- filter(hs_time_transition, HS == 3, HS_diff == 2)
hs_time_transition_14 <- filter(hs_time_transition, HS == 4, HS_diff == 3)
hs_time_transition_15 <- filter(hs_time_transition, HS == 5, HS_diff == 4)
hs_time_transition_16 <- filter(hs_time_transition, HS == 6, HS_diff == 5)
hs_time_transition_17 <- filter(hs_time_transition, HS == 7, HS_diff == 6)
# transitions starting from HS = 2
# (name suffix = <HS before><HS after>; HS_diff = HS after - HS before)
hs_time_transition_20 <- filter(hs_time_transition, HS == 0, HS_diff == -2)
hs_time_transition_21 <- filter(hs_time_transition, HS == 1, HS_diff == -1)
hs_time_transition_23 <- filter(hs_time_transition, HS == 3, HS_diff == 1)
hs_time_transition_24 <- filter(hs_time_transition, HS == 4, HS_diff == 2)
hs_time_transition_25 <- filter(hs_time_transition, HS == 5, HS_diff == 3)
hs_time_transition_26 <- filter(hs_time_transition, HS == 6, HS_diff == 4)
hs_time_transition_27 <- filter(hs_time_transition, HS == 7, HS_diff == 5)
# transitions starting from HS = 3
# (name suffix = <HS before><HS after>; HS_diff = HS after - HS before)
hs_time_transition_30 <- filter(hs_time_transition, HS == 0, HS_diff == -3)
hs_time_transition_31 <- filter(hs_time_transition, HS == 1, HS_diff == -2)
hs_time_transition_32 <- filter(hs_time_transition, HS == 2, HS_diff == -1)
hs_time_transition_34 <- filter(hs_time_transition, HS == 4, HS_diff == 1)
hs_time_transition_35 <- filter(hs_time_transition, HS == 5, HS_diff == 2)
hs_time_transition_36 <- filter(hs_time_transition, HS == 6, HS_diff == 3)
hs_time_transition_37 <- filter(hs_time_transition, HS == 7, HS_diff == 4)
# transitions starting from HS = 4
# (name suffix = <HS before><HS after>; HS_diff = HS after - HS before)
hs_time_transition_40 <- filter(hs_time_transition, HS == 0, HS_diff == -4)
hs_time_transition_41 <- filter(hs_time_transition, HS == 1, HS_diff == -3)
hs_time_transition_42 <- filter(hs_time_transition, HS == 2, HS_diff == -2)
hs_time_transition_43 <- filter(hs_time_transition, HS == 3, HS_diff == -1)
hs_time_transition_45 <- filter(hs_time_transition, HS == 5, HS_diff == 1)
hs_time_transition_46 <- filter(hs_time_transition, HS == 6, HS_diff == 2)
hs_time_transition_47 <- filter(hs_time_transition, HS == 7, HS_diff == 3)
# transitions starting from HS = 5
# (name suffix = <HS before><HS after>; HS_diff = HS after - HS before)
hs_time_transition_50 <- filter(hs_time_transition, HS == 0, HS_diff == -5)
hs_time_transition_51 <- filter(hs_time_transition, HS == 1, HS_diff == -4)
hs_time_transition_52 <- filter(hs_time_transition, HS == 2, HS_diff == -3)
hs_time_transition_53 <- filter(hs_time_transition, HS == 3, HS_diff == -2)
hs_time_transition_54 <- filter(hs_time_transition, HS == 4, HS_diff == -1)
hs_time_transition_56 <- filter(hs_time_transition, HS == 6, HS_diff == 1)
hs_time_transition_57 <- filter(hs_time_transition, HS == 7, HS_diff == 2)
# transitions starting from HS = 6
# (name suffix = <HS before><HS after>; HS_diff = HS after - HS before)
hs_time_transition_60 <- filter(hs_time_transition, HS == 0, HS_diff == -6)
hs_time_transition_61 <- filter(hs_time_transition, HS == 1, HS_diff == -5)
hs_time_transition_62 <- filter(hs_time_transition, HS == 2, HS_diff == -4)
hs_time_transition_63 <- filter(hs_time_transition, HS == 3, HS_diff == -3)
hs_time_transition_64 <- filter(hs_time_transition, HS == 4, HS_diff == -2)
hs_time_transition_65 <- filter(hs_time_transition, HS == 5, HS_diff == -1)
hs_time_transition_67 <- filter(hs_time_transition, HS == 7, HS_diff == 1)

```

```{r automatic input sae, eval = eval_dsmb, echo = echo_dsmb, message = F}

##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: safety (SAEs)                   #
##########################################

#Serious adverse events

# All event records flagged as serious in either SAE column, de-duplicated
df_sae0 <- df2_full %>%
  filter(event_sae == 'Yes' | event_sae_2 == 'Yes') %>%
  distinct()

# SAE records with a health score in event_hs (un-suffixed columns)
df_sae0_rti <- df_sae0 %>%
  filter(!is.na(event_hs)) %>%
  select(
    PID, event_number,
    report_date, event_sae, event_start, aware_date, event_name,
    sae_criteria, death_cause,
    event_description, event_treatment, event_investigation,
    event_hs, event_causality, event_status, event_ongoing, event_stop,
    event_HS
  )

# SAE records with a health score in event_hs_2; the *_2 columns are renamed
# to the un-suffixed names so both SAE tables share one header and can be
# combined (e.g. report_date and report_date_2)
df_sae0_other <- df_sae0 %>%
  filter(!is.na(event_hs_2)) %>%
  select(
    PID, event_number,
    report_date = report_date_2,
    event_sae = event_sae_2,
    event_start = event_start_2,
    aware_date = aware_date_2,
    event_name = event_name_2,
    sae_criteria = sae_criteria_2,
    death_cause = death_cause_2,
    event_description = event_description_2,
    event_treatment = event_treatment_2,
    event_investigation = event_investigation_2,
    event_hs = event_hs_2,
    event_causality = event_causality_2,
    event_status = event_status_2,
    event_ongoing = event_ongoing_2,
    event_stop = event_stop_2,
    event_HS
  )

# Combined SAE table, ordered by event start (compared as text), with a site
# label derived from the numeric part of the participant ID (text after 'BCG').
# PID is split directly (remove = FALSE keeps it) instead of via a temporary
# copy; the helper columns ID0 / ID are dropped again at the end.
df_sae <- rbind(df_sae0_rti, df_sae0_other) %>%
  arrange(as.character(event_start)) %>%
  separate(PID, into = c('ID0', 'ID'), sep = 'BCG', remove = FALSE) %>%
  mutate(ID = as.numeric(ID), # separate() returns character columns
         # ID ranges per site; IDs above 5500 get no site (NA)
         site = ifelse(ID <= 450, 'Central',
                       ifelse(ID < 5000, 'Eden',
                              ifelse(ID <= 5500, 'UCT', NA)))) %>%
  select(!c(ID0, ID))

```

```{r serology dataset, eval=eval_dsmb, echo=echo_dsmb, message=FALSE}
##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: SARS-COV-2 serology             #
##########################################

# Serology results per participant and visit: numeric ID, time since
# vaccination in whole weeks, and a 0/1 serology flag.
df_sero <- df_full %>%
  filter(!is.na(visit_week_id_3)) %>%
  distinct(PID, visit_date_3, sars_cov_2, date_vaccination) %>%

  # serology data: POS = 1, NEG = 0, EQU assumed 0, anything else NA
  mutate(SERO = ifelse(sars_cov_2 == "POS", 1,
                       ifelse(sars_cov_2 %in% c("NEG", "EQU"), 0,
                              NA))) %>%

  # get ID number from PID (text after "BCG"); separate() returns character
  separate(PID, into = c("temp", "ID"), sep = "BCG") %>%
  mutate(ID = as.numeric(ID)) %>%

  # get time in weeks to join the Markov model dataset
  # (`units` spelled out; `unit` only worked through partial argument matching)
  mutate(TIME = round(as.numeric(as.Date(visit_date_3) - as.Date(date_vaccination),
                                 units = "weeks"))) %>%

  select(ID, TIME, SERO)

```

```{r automatic input safety, eval = eval_dsmb, echo = echo_dsmb, message = F}

##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: safety (all)                    #
##########################################

## health status definitions:
# event_hs / event_hs_1 / event_hs_2 record the highest health score (HS) for an event (unique event is identified by event_number, which should not overlap -> discrepancies will be discussed)
# for respiratory tract infections (RTIs), event_hs_fu also records the highest HS for the follow-up 
# for injection site reactions (ISRs) and other events, event_hs_fu_1 also records the highest HS for the follow-up 
# the highest of these two constitutes the highest HS per event and will be reported to the DSMB -> recorded in event_HS variable.
## health status over time definitions:
# HS over time is reported for RTI only in health_status (original event form, Onset:Week_5, filled in once) and follow-up (event_rti_fu, follow-up RTI form, week_1:week_12, filled in with a weekly frequency)
# The tables are complementary, e.g. Week_1 in health_status should be the same as week_1 in event_rti_fu. 
# The tables will be merged for reporting HS over time (discrepancies will be discussed)

#group assign df2_full 

# attach the arm assignment from PID_group to every event record, matched on PID
df2_full <- left_join(df2_full, PID_group, by = 'PID')

#most common adverse events

# MedDRA lookup table: the columns LLT through SOC_code, one row per LLT_code
# (first occurrence kept). TRUE is spelled out because `T` can be reassigned.
df_meddra <- df2_full %>%
  select(LLT:SOC_code) %>%
  distinct(LLT_code, .keep_all = TRUE)

# Number of distinct events per event type (event_id), LLT code and arm,
# most frequent first, with the MedDRA terms joined back on.
df_ae <- df2_full %>%
  distinct(PID, event_number, LLT, LLT_code, event_id, group) %>%
  group_by(event_id, LLT_code, group) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  left_join(df_meddra) %>%
  # factor levels in order of first appearance, i.e. by descending count
  mutate(LLT = factor(LLT, levels = unique(LLT)))

# Same event counts as df_ae, restricted to records with event_grade >= 3
df_ae_grade34 <- df2_full %>%
  filter(event_grade >= 3) %>%
  distinct(PID, event_number, LLT, LLT_code, event_id, group) %>%
  group_by(event_id, LLT_code, group) %>%
  summarize(n = n()) %>%
  ungroup() %>%
  arrange(desc(n)) %>%
  left_join(df_meddra) %>%
  # factor levels in order of first appearance, i.e. by descending count
  mutate(LLT = factor(LLT, levels = unique(LLT)))

# three colours from the 'inferno' palette, stopping at 0.75 of its range
ae_viridis <- viridis(3, option = 'inferno', end = 0.75)

#input safety summary

# number of events: unique PID / event_number combinations in the full event
# dataset (the former sort on event_HS could not change this count)
n_ae <- df2_full %>%
  distinct(PID, event_number) %>%
  nrow()

# number of participants with at least one record in the event dataset
part_ae <- df2_full %>%
  distinct(PID) %>%
  nrow()

# number of events with event_HS = 6 (ICU): overall and per arm.
# Counts unique PID / event_number / event_HS combinations; the former sort on
# event_HS could not change these counts and was dropped.
n_ae_icu <- df2_full %>%
  distinct(PID, event_number, event_HS) %>%
  filter(event_HS == 6) %>% # select HS = 6 which is ICU
  nrow()

n_ae_icu_group1 <- df2_full %>%
  filter(group == Arm1) %>%
  filter(event_HS == 6) %>% # select HS = 6 which is ICU
  distinct(PID, event_number, event_HS) %>%
  nrow()
n_ae_icu_group2 <- df2_full %>%
  filter(group == Arm2) %>%
  filter(event_HS == 6) %>% # select HS = 6 which is ICU
  distinct(PID, event_number, event_HS) %>%
  nrow()

# number of participants with at least one event at event_HS = 6 (ICU):
# overall and per arm. The former sort on event_HS could not change a count of
# distinct PIDs and was dropped.
part_ae_icu <- df2_full %>%
  filter(event_HS == 6) %>% # select HS = 6 which is ICU
  distinct(PID) %>% # unique participants
  nrow()

part_ae_icu_group1 <- df2_full %>%
  filter(group == Arm1) %>%
  filter(event_HS == 6) %>% # select HS = 6 which is ICU
  distinct(PID) %>% # unique participants
  nrow()
part_ae_icu_group2 <- df2_full %>%
  filter(group == Arm2) %>%
  filter(event_HS == 6) %>% # select HS = 6 which is ICU
  distinct(PID) %>% # unique participants
  nrow()

## RTIs

# number of RTI events (unique PID / event_number): overall and per arm.
# report_date is the input column from the event_rti datasets, so a non-empty
# value marks an RTI record; rows with a missing report_date are dropped by the
# comparison as well. The former sort on event_HS could not change these counts.
n_ae_rti <- df2_full %>%
  filter(report_date != '') %>%
  distinct(PID, event_number) %>%
  nrow()
n_ae_rti_group1 <- df2_full %>%
  filter(group == Arm1) %>%
  filter(report_date != '') %>%
  distinct(PID, event_number) %>%
  nrow()
n_ae_rti_group2 <- df2_full %>%
  filter(group == Arm2) %>%
  filter(report_date != '') %>%
  distinct(PID, event_number) %>%
  nrow()

# participants (with their arm) that have at least one RTI record, listed in
# order of descending event_HS (the sort fixes the row order of the result)
PID_ae_rti <- df2_full %>%
  filter(report_date != '') %>% # report_date comes from the event_rti datasets
  arrange(desc(event_HS)) %>%
  distinct(PID, group)

# same, but only counting RTI records whose event_hs is not zero
PID_ae_rti_nozero <- df2_full %>%
  filter(
    event_hs != 0,    # remove HS=zero
    report_date != '' # report_date comes from the event_rti datasets
  ) %>%
  arrange(desc(event_HS)) %>%
  distinct(PID, group)

# number of participants with an RTI record: overall and per arm
# (PID_ae_rti already holds one row per PID / group combination)
part_ae_rti <- nrow(PID_ae_rti)
part_ae_rti_group1 <- nrow(filter(PID_ae_rti, group == Arm1))
part_ae_rti_group2 <- nrow(filter(PID_ae_rti, group == Arm2))

# RTI events with event_HS = 0 (healthy): one row per PID / event_number,
# keeping the record with the highest event_HS
ae_rti_healthy <- df2_full %>%
  filter(
    report_date != '', # report_date is the input column from the event_rti datasets
    !is.na(event_hs)   # remove NA health status
  ) %>%
  arrange(desc(event_HS)) %>% # highest event HS first, so distinct() keeps it
  distinct(PID, event_number, .keep_all = TRUE) %>%
  filter(event_HS == 0) # filter for healthy (HS=0)
# number of such events
n_ae_rti_healthy <- nrow(ae_rti_healthy)
# number of unique participants with such an event (derived from the table
# above instead of repeating the whole pipeline)
part_ae_rti_healthy <- ae_rti_healthy %>%
  distinct(PID) %>%
  nrow()

# RTI events with event_HS = 1 (mild): one row per PID / event_number,
# keeping the record with the highest event_HS
ae_rti_mild <- df2_full %>%
  filter(
    report_date != '', # report_date is the input column from the event_rti datasets
    !is.na(event_hs)   # remove NA health status
  ) %>%
  arrange(desc(event_HS)) %>% # highest event HS first, so distinct() keeps it
  distinct(PID, event_number, .keep_all = TRUE) %>%
  filter(event_HS == 1) # filter for mild (HS=1)
# number of such events
n_ae_rti_mild <- nrow(ae_rti_mild)
# number of unique participants with such an event (derived from the table
# above instead of repeating the whole pipeline)
part_ae_rti_mild <- ae_rti_mild %>%
  distinct(PID) %>%
  nrow()
  
# RTI events with event_HS = 2 (moderate): one row per PID / event_number,
# keeping the record with the highest event_HS
ae_rti_moderate <- df2_full %>%
  filter(
    report_date != '', # report_date is the input column from the event_rti datasets
    !is.na(event_hs)   # remove NA health status
  ) %>%
  arrange(desc(event_HS)) %>% # highest event HS first, so distinct() keeps it
  distinct(PID, event_number, .keep_all = TRUE) %>%
  filter(event_HS == 2) # filter for moderate (HS=2)
# number of such events
n_ae_rti_moderate <- nrow(ae_rti_moderate)
# number of unique participants with such an event (derived from the table
# above instead of repeating the whole pipeline)
part_ae_rti_moderate <- ae_rti_moderate %>%
  distinct(PID) %>%
  nrow()
  
# RTI events with event_HS = 3 (severe): one row per PID / event_number,
# keeping the record with the highest event_HS
ae_rti_severe <- df2_full %>%
  filter(
    report_date != '', # report_date is the input column from the event_rti datasets
    !is.na(event_hs)   # remove NA health status
  ) %>%
  arrange(desc(event_HS)) %>% # highest event HS first, so distinct() keeps it
  distinct(PID, event_number, .keep_all = TRUE) %>%
  filter(event_HS == 3) # filter for severe (HS=3)
# number of such events
n_ae_rti_severe <- nrow(ae_rti_severe)
# number of unique participants with such an event (derived from the table
# above instead of repeating the whole pipeline)
part_ae_rti_severe <- ae_rti_severe %>%
  distinct(PID) %>%
  nrow()

# RTI events with event_HS = 4 (hospitalized): one row per PID / event_number,
# keeping the record with the highest event_HS
ae_rti_hosp <- df2_full %>%
  filter(
    report_date != '', # report_date is the input column from the event_rti datasets
    !is.na(event_hs)   # remove NA health status
  ) %>%
  arrange(desc(event_HS)) %>% # highest event HS first, so distinct() keeps it
  distinct(PID, event_number, .keep_all = TRUE) %>%
  filter(event_HS == 4) # filter for hospitalized (HS=4)
# number of such events
n_ae_rti_hosp <- nrow(ae_rti_hosp)
# number of unique participants with such an event (derived from the table
# above instead of repeating the whole pipeline)
part_ae_rti_hosp <- ae_rti_hosp %>%
  distinct(PID) %>%
  nrow()

# RTI events with event_HS = 5 (hospitalized with oxygen): one row per
# PID / event_number, keeping the record with the highest event_HS
ae_rti_hosp_oxy <- df2_full %>%
  filter(
    report_date != '', # report_date is the input column from the event_rti datasets
    !is.na(event_hs)   # remove NA health status
  ) %>%
  arrange(desc(event_HS)) %>% # highest event HS first, so distinct() keeps it
  distinct(PID, event_number, .keep_all = TRUE) %>%
  filter(event_HS == 5) # filter for hospitalized with oxygen (HS=5)
# number of such events
n_ae_rti_hosp_oxy <- nrow(ae_rti_hosp_oxy)
# number of unique participants with such an event (derived from the table
# above instead of repeating the whole pipeline)
part_ae_rti_hosp_oxy <- ae_rti_hosp_oxy %>%
  distinct(PID) %>%
  nrow()

# RTI events whose retained (highest) health status is hospitalized with ventilation (HS = 6).
# report_date and event_hs are the input columns from the event_rti datasets.
ae_rti_hosp_vent <- df2_full %>%
  filter(report_date != '', !is.na(event_hs)) %>% # RTI records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 6)
n_ae_rti_hosp_vent <- nrow(ae_rti_hosp_vent) # number of events
part_ae_rti_hosp_vent <- ae_rti_hosp_vent %>% # number of participants with such an event
  distinct(PID) %>%
  nrow()

# Fatal RTI events (HS = 7).
# report_date and event_hs are the input columns from the event_rti datasets.
ae_rti_dead <- df2_full %>%
  filter(report_date != '', !is.na(event_hs)) %>% # RTI records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 7) %>%
  # for fatal events only distinct PIDs are reported: the same death can be
  # recorded twice with different descriptions
  distinct(PID, .keep_all = TRUE)
n_ae_rti_dead <- nrow(ae_rti_dead)
part_ae_rti_dead <- ae_rti_dead %>% # number of participants
  distinct(PID) %>%
  nrow()

## ISR

# Number of ISR events (unique PID / event_number pairs), overall and for Arm2.
# report_date_1 is the input column from the event_isr dataset.
#
# NOTE(review): the original code assigned `n_ae_isr` twice; the second,
# Arm2-only assignment silently overwrote the overall total. The per-arm count
# now gets its own name, following the `_group2` convention used for the
# "Other" events, and `n_ae_isr` keeps the overall total. Confirm that the
# report text using `n_ae_isr` expects the overall total.
n_ae_isr <- df2_full %>%
  filter(report_date_1 != '') %>%
  distinct(PID, event_number) %>% # one row per event per participant
  nrow()
n_ae_isr_group2 <- df2_full %>%
  filter(group == Arm2) %>%
  filter(report_date_1 != '') %>%
  distinct(PID, event_number) %>% # one row per event per participant
  nrow()

# Number of participants with at least one ISR record.
# report_date_1 is the input column from the event_isr dataset.
part_ae_isr <- df2_full %>%
  filter(report_date_1 != '') %>%
  distinct(PID) %>% # unique participants
  nrow()

# ISR events whose retained (highest) health status is healthy (HS = 0).
# report_date_1 and event_hs_1 are the input columns from the event_isr dataset.
ae_isr_healthy <- df2_full %>%
  filter(report_date_1 != '', !is.na(event_hs_1)) %>% # ISR records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 0)
n_ae_isr_healthy <- nrow(ae_isr_healthy) # number of events
part_ae_isr_healthy <- ae_isr_healthy %>% # number of participants with such an event
  distinct(PID) %>%
  nrow()

# ISR events whose retained (highest) health status is mild (HS = 1).
# report_date_1 and event_hs_1 are the input columns from the event_isr dataset.
ae_isr_mild <- df2_full %>%
  filter(report_date_1 != '', !is.na(event_hs_1)) %>% # ISR records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 1)
n_ae_isr_mild <- nrow(ae_isr_mild) # number of events
part_ae_isr_mild <- ae_isr_mild %>% # number of participants with such an event
  distinct(PID) %>%
  nrow()

# ISR events whose retained (highest) health status is moderate (HS = 2).
# report_date_1 and event_hs_1 are the input columns from the event_isr dataset.
ae_isr_moderate <- df2_full %>%
  filter(report_date_1 != '', !is.na(event_hs_1)) %>% # ISR records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 2)
n_ae_isr_moderate <- nrow(ae_isr_moderate) # number of events
part_ae_isr_moderate <- ae_isr_moderate %>% # number of participants with such an event
  distinct(PID) %>%
  nrow()

# ISR events whose retained (highest) health status is severe (HS = 3).
# report_date_1 and event_hs_1 are the input columns from the event_isr dataset.
ae_isr_severe <- df2_full %>%
  filter(report_date_1 != '', !is.na(event_hs_1)) %>% # ISR records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 3)
n_ae_isr_severe <- nrow(ae_isr_severe) # number of events
part_ae_isr_severe <- ae_isr_severe %>% # number of participants with such an event
  distinct(PID) %>%
  nrow()

# ISR events whose retained (highest) health status is hospitalized (HS = 4).
# report_date_1 and event_hs_1 are the input columns from the event_isr dataset.
ae_isr_hosp <- df2_full %>%
  filter(report_date_1 != '', !is.na(event_hs_1)) %>% # ISR records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 4)
n_ae_isr_hosp <- nrow(ae_isr_hosp) # number of events
part_ae_isr_hosp <- ae_isr_hosp %>% # number of participants with such an event
  distinct(PID) %>%
  nrow()

# ISR events whose retained (highest) health status is hospitalized with oxygen (HS = 5).
# report_date_1 and event_hs_1 are the input columns from the event_isr dataset.
ae_isr_hosp_oxy <- df2_full %>%
  filter(report_date_1 != '', !is.na(event_hs_1)) %>% # ISR records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 5)
n_ae_isr_hosp_oxy <- nrow(ae_isr_hosp_oxy) # number of events
part_ae_isr_hosp_oxy <- ae_isr_hosp_oxy %>% # number of participants with such an event
  distinct(PID) %>%
  nrow()

# ISR events whose retained (highest) health status is hospitalized with ventilation (HS = 6).
# report_date_1 and event_hs_1 are the input columns from the event_isr dataset.
ae_isr_hosp_vent <- df2_full %>%
  filter(report_date_1 != '', !is.na(event_hs_1)) %>% # ISR records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 6)
n_ae_isr_hosp_vent <- nrow(ae_isr_hosp_vent) # number of events
part_ae_isr_hosp_vent <- ae_isr_hosp_vent %>% # number of participants with such an event
  distinct(PID) %>%
  nrow()

# Fatal ISR events (HS = 7).
# report_date_1 and event_hs_1 are the input columns from the event_isr dataset.
ae_isr_dead <- df2_full %>%
  filter(report_date_1 != '', !is.na(event_hs_1)) %>% # ISR records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 7) %>%
  # for fatal events only distinct PIDs are reported: the same death can be
  # recorded twice with different descriptions
  distinct(PID, .keep_all = TRUE)
n_ae_isr_dead <- nrow(ae_isr_dead)
part_ae_isr_dead <- ae_isr_dead %>% # number of participants
  distinct(PID) %>%
  nrow()

## Other

# Number of "other" adverse events (unique PID / event_number pairs),
# overall and per arm.
# report_date_2 is the input column from the event_other dataset.
n_ae_other <- df2_full %>%
  filter(report_date_2 != '') %>%
  distinct(PID, event_number) %>% # one row per event per participant
  nrow()
n_ae_other_group1 <- df2_full %>%
  filter(group == Arm1) %>%
  filter(report_date_2 != '') %>%
  distinct(PID, event_number) %>% # one row per event per participant
  nrow()
n_ae_other_group2 <- df2_full %>%
  filter(group == Arm2) %>%
  filter(report_date_2 != '') %>%
  distinct(PID, event_number) %>% # one row per event per participant
  nrow()

# Number of participants with at least one "other" adverse event record,
# overall and per arm.
# report_date_2 is the input column from the event_other dataset.
part_ae_other <- df2_full %>%
  filter(report_date_2 != '') %>%
  distinct(PID) %>% # unique participants
  nrow()
part_ae_other_group1 <- df2_full %>%
  filter(group == Arm1, report_date_2 != '') %>%
  distinct(PID) %>% # unique participants
  nrow()
part_ae_other_group2 <- df2_full %>%
  filter(group == Arm2, report_date_2 != '') %>%
  distinct(PID) %>% # unique participants
  nrow()

# "Other" events whose retained (highest) health status is healthy (HS = 0).
# report_date_2 and event_hs_2 are the input columns from the event_other dataset.
ae_other_healthy <- df2_full %>%
  filter(report_date_2 != '', !is.na(event_hs_2)) %>% # "other" records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 0)
n_ae_other_healthy <- nrow(ae_other_healthy) # number of events
part_ae_other_healthy <- ae_other_healthy %>% # number of participants with such an event
  distinct(PID) %>%
  nrow()

# "Other" events whose retained (highest) health status is mild (HS = 1).
# report_date_2 and event_hs_2 are the input columns from the event_other dataset.
ae_other_mild <- df2_full %>%
  filter(report_date_2 != '', !is.na(event_hs_2)) %>% # "other" records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 1)
n_ae_other_mild <- nrow(ae_other_mild) # number of events
part_ae_other_mild <- ae_other_mild %>% # number of participants with such an event
  distinct(PID) %>%
  nrow()

# "Other" events whose retained (highest) health status is moderate (HS = 2).
# report_date_2 and event_hs_2 are the input columns from the event_other dataset.
ae_other_moderate <- df2_full %>%
  filter(report_date_2 != '', !is.na(event_hs_2)) %>% # "other" records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 2)
n_ae_other_moderate <- nrow(ae_other_moderate) # number of events
part_ae_other_moderate <- ae_other_moderate %>% # number of participants with such an event
  distinct(PID) %>%
  nrow()

# "Other" events whose retained (highest) health status is severe (HS = 3).
# report_date_2 and event_hs_2 are the input columns from the event_other dataset.
ae_other_severe <- df2_full %>%
  filter(report_date_2 != '', !is.na(event_hs_2)) %>% # "other" records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 3)
n_ae_other_severe <- nrow(ae_other_severe) # number of events
part_ae_other_severe <- ae_other_severe %>% # number of participants with such an event
  distinct(PID) %>%
  nrow()

# "Other" events whose retained (highest) health status is hospitalized (HS = 4).
# report_date_2 and event_hs_2 are the input columns from the event_other dataset.
ae_other_hosp <- df2_full %>%
  filter(report_date_2 != '', !is.na(event_hs_2)) %>% # "other" records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 4)
n_ae_other_hosp <- nrow(ae_other_hosp) # number of events
part_ae_other_hosp <- ae_other_hosp %>% # number of participants with such an event
  distinct(PID) %>%
  nrow()

# "Other" events whose retained (highest) health status is hospitalized with oxygen (HS = 5).
# report_date_2 and event_hs_2 are the input columns from the event_other dataset.
ae_other_hosp_oxy <- df2_full %>%
  filter(report_date_2 != '', !is.na(event_hs_2)) %>% # "other" records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 5)
n_ae_other_hosp_oxy <- nrow(ae_other_hosp_oxy) # number of events
part_ae_other_hosp_oxy <- ae_other_hosp_oxy %>% # number of participants with such an event
  distinct(PID) %>%
  nrow()

# "Other" events whose retained (highest) health status is hospitalized with ventilation (HS = 6).
# report_date_2 and event_hs_2 are the input columns from the event_other dataset.
ae_other_hosp_vent <- df2_full %>%
  filter(report_date_2 != '', !is.na(event_hs_2)) %>% # "other" records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 6)
n_ae_other_hosp_vent <- nrow(ae_other_hosp_vent) # number of events
part_ae_other_hosp_vent <- ae_other_hosp_vent %>% # number of participants with such an event
  distinct(PID) %>%
  nrow()

# Fatal "other" events (HS = 7).
# report_date_2 and event_hs_2 are the input columns from the event_other dataset.
ae_other_dead <- df2_full %>%
  filter(report_date_2 != '', !is.na(event_hs_2)) %>% # "other" records with a known health status
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) %>% # one record per event per participant
  filter(event_HS == 7) %>%
  # for fatal events only distinct PIDs are reported: the same death can be
  # recorded twice with different descriptions
  distinct(PID, .keep_all = TRUE)
n_ae_other_dead <- nrow(ae_other_dead)
part_ae_other_dead <- ae_other_dead %>% # number of participants
  distinct(PID) %>%
  nrow()

#serious adverse events

# Serious adverse events among RTI records (event_sae == 'Yes').
# report_date is the input column from the event_rti datasets.
sae_rti <- df2_full %>%
  filter(report_date != '', event_sae == 'Yes') %>%
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) # one record per event per participant
n_sae_rti <- nrow(sae_rti)

# Data frame (not a count) holding the unique PIDs with a serious RTI event
part_sae_rti <- df2_full %>%
  filter(report_date != '', event_sae == 'Yes') %>%
  distinct(PID)

# Serious adverse events among "other" records (event_sae_2 == 'Yes').
# report_date_2 is the input column from the event_other datasets.
sae_other <- df2_full %>%
  filter(report_date_2 != '', event_sae_2 == 'Yes') %>%
  arrange(desc(event_HS)) %>% # highest HS first, so distinct() keeps that record
  distinct(PID, event_number, .keep_all = TRUE) # one record per event per participant
n_sae_other <- nrow(sae_other)

# Data frame (not a count) holding the unique PIDs with a serious "other" event
part_sae_other <- df2_full %>%
  filter(report_date_2 != '', event_sae_2 == 'Yes') %>%
  distinct(PID)

# All serious adverse events: RTI and "other" records combined
sae <- sae_rti %>% full_join(sae_other)
n_sae <- n_sae_rti + n_sae_other # total serious adverse events is the sum of RTI and other

# Events per arm
n_sae_group1 <- sum(sae$group == Arm1, na.rm = TRUE)
n_sae_group2 <- sum(sae$group == Arm2, na.rm = TRUE)

# Participants with at least one serious adverse event, overall and per arm
part_sae <- length(unique(c(unlist(part_sae_rti), unlist(part_sae_other))))
part_sae_group1 <- sae %>%
  filter(group == Arm1) %>%
  distinct(PID) %>%
  nrow()
part_sae_group2 <- sae %>%
  filter(group == Arm2) %>%
  distinct(PID) %>%
  nrow()

# Serious adverse events per causality category, all arms.
# The three causality columns (event_causality, _1, _2) are counted together;
# %in% is FALSE for NA, so missing values are not counted.
n_sae_unrelated <- sum(
  sae$event_causality %in% c('Unrelated', 'Unrrelated'), # 'Unrrelated' spelling is also counted
  sae$event_causality_1 %in% c('Unrelated', 'Unrrelated'),
  sae$event_causality_2 %in% c('Unrelated', 'Unrrelated')
)
n_sae_unlikelyrelated <- sum(
  sae$event_causality %in% 'Unlikely_related',
  sae$event_causality_1 %in% 'Unlikely_related',
  sae$event_causality_2 %in% 'Unlikely_related'
)
n_sae_possiblyrelated <- sum(
  sae$event_causality %in% 'Possibly_related',
  sae$event_causality_1 %in% 'Possibly_related',
  sae$event_causality_2 %in% 'Possibly_related'
)
n_sae_probablyrelated <- sum(
  sae$event_causality %in% 'Probably_related',
  sae$event_causality_1 %in% 'Probably_related',
  sae$event_causality_2 %in% 'Probably_related'
)
n_sae_definitely <- sum(
  sae$event_causality %in% 'Definitely',
  sae$event_causality_1 %in% 'Definitely',
  sae$event_causality_2 %in% 'Definitely'
)

# Serious adverse events per causality category, restricted to Arm1.
# The three causality columns (event_causality, _1, _2) are counted together;
# %in% is FALSE for NA, so missing values are not counted.
n_sae_unrelated_group1 <- sum(
  sae$event_causality[sae$group == Arm1] %in% c('Unrelated', 'Unrrelated'), # 'Unrrelated' spelling is also counted
  sae$event_causality_1[sae$group == Arm1] %in% c('Unrelated', 'Unrrelated'),
  sae$event_causality_2[sae$group == Arm1] %in% c('Unrelated', 'Unrrelated')
)
n_sae_unlikelyrelated_group1 <- sum(
  sae$event_causality[sae$group == Arm1] %in% 'Unlikely_related',
  sae$event_causality_1[sae$group == Arm1] %in% 'Unlikely_related',
  sae$event_causality_2[sae$group == Arm1] %in% 'Unlikely_related'
)
n_sae_possiblyrelated_group1 <- sum(
  sae$event_causality[sae$group == Arm1] %in% 'Possibly_related',
  sae$event_causality_1[sae$group == Arm1] %in% 'Possibly_related',
  sae$event_causality_2[sae$group == Arm1] %in% 'Possibly_related'
)
n_sae_probablyrelated_group1 <- sum(
  sae$event_causality[sae$group == Arm1] %in% 'Probably_related',
  sae$event_causality_1[sae$group == Arm1] %in% 'Probably_related',
  sae$event_causality_2[sae$group == Arm1] %in% 'Probably_related'
)
n_sae_definitely_group1 <- sum(
  sae$event_causality[sae$group == Arm1] %in% 'Definitely',
  sae$event_causality_1[sae$group == Arm1] %in% 'Definitely',
  sae$event_causality_2[sae$group == Arm1] %in% 'Definitely'
)

# Serious adverse events per causality category, restricted to Arm2.
# The three causality columns (event_causality, _1, _2) are counted together;
# %in% is FALSE for NA, so missing values are not counted.
n_sae_unrelated_group2 <- sum(
  sae$event_causality[sae$group == Arm2] %in% c('Unrelated', 'Unrrelated'), # 'Unrrelated' spelling is also counted
  sae$event_causality_1[sae$group == Arm2] %in% c('Unrelated', 'Unrrelated'),
  sae$event_causality_2[sae$group == Arm2] %in% c('Unrelated', 'Unrrelated')
)
n_sae_unlikelyrelated_group2 <- sum(
  sae$event_causality[sae$group == Arm2] %in% 'Unlikely_related',
  sae$event_causality_1[sae$group == Arm2] %in% 'Unlikely_related',
  sae$event_causality_2[sae$group == Arm2] %in% 'Unlikely_related'
)
n_sae_possiblyrelated_group2 <- sum(
  sae$event_causality[sae$group == Arm2] %in% 'Possibly_related',
  sae$event_causality_1[sae$group == Arm2] %in% 'Possibly_related',
  sae$event_causality_2[sae$group == Arm2] %in% 'Possibly_related'
)
n_sae_probablyrelated_group2 <- sum(
  sae$event_causality[sae$group == Arm2] %in% 'Probably_related',
  sae$event_causality_1[sae$group == Arm2] %in% 'Probably_related',
  sae$event_causality_2[sae$group == Arm2] %in% 'Probably_related'
)
n_sae_definitely_group2 <- sum(
  sae$event_causality[sae$group == Arm2] %in% 'Definitely',
  sae$event_causality_1[sae$group == Arm2] %in% 'Definitely',
  sae$event_causality_2[sae$group == Arm2] %in% 'Definitely'
)

#mortality, hospitalization

# Fatal events from all three sources (RTI, ISR, other), one row per participant
ae_dead <- Reduce(full_join, list(ae_rti_dead, ae_isr_dead, ae_other_dead)) %>%
  distinct(PID, .keep_all = TRUE)

# ae_dead already holds one row per PID, so row counts are participant counts
n_deaths <- nrow(ae_dead)
cause_deaths <- unlist(select(ae_dead, LLT)) # LLT term recorded for each death
n_deaths_group1 <- sum(ae_dead$group == Arm1, na.rm = TRUE)
n_deaths_group2 <- sum(ae_dead$group == Arm2, na.rm = TRUE)

# Participant characteristics for the deaths: df_full records matching the
# PID / event_number pairs in ae_dead, one row per participant
details_deaths <- df_full %>%
  filter(paste(PID, event_number) %in% paste(ae_dead$PID, ae_dead$event_number)) %>%
  distinct(PID, .keep_all = TRUE) %>%
  select(
    PID, LLT, age, gender, BMI, ethnicity, job_category,
    medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd,
    medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years,
    group, site
  )

# All events with HS >= 4 (hospitalized, with oxygen, with ventilation, fatal)
# across RTI, ISR and other events, joined left to right in the listed order
hosp <- Reduce(
  full_join,
  list(
    ae_rti_hosp, ae_rti_hosp_oxy, ae_rti_hosp_vent, ae_rti_dead,
    ae_isr_hosp, ae_isr_hosp_oxy, ae_isr_hosp_vent, ae_isr_dead,
    ae_other_hosp, ae_other_hosp_oxy, ae_other_hosp_vent, ae_other_dead
  )
)

# Number of participants with at least one such event, overall and per arm
n_hosp <- hosp %>%
  distinct(PID) %>%
  nrow()
n_hosp_group1 <- hosp %>%
  filter(group == Arm1) %>%
  distinct(PID) %>%
  nrow()
n_hosp_group2 <- hosp %>%
  filter(group == Arm2) %>%
  distinct(PID) %>%
  nrow()

# adverse events by grade

# Unique events (PID, event_number, group) by event grade:
# grade 2 or higher, exactly grade 3, and exactly grade 4
df_ae_grade234 <- df_full %>% filter(event_grade >= 2) %>% distinct(PID, event_number, group)
df_ae_grade3 <- df_full %>% filter(event_grade == 3) %>% distinct(PID, event_number, group)
df_ae_grade4 <- df_full %>% filter(event_grade == 4) %>% distinct(PID, event_number, group)

# Unique ISR events (event_id == 'ISR') of grade 2 or higher
df_ae_grade234_ISR <- df_full %>%
  filter(event_grade >= 2) %>%
  filter(event_id == 'ISR') %>%
  distinct(PID, event_number, group)

# Events and participants, overall
n_ae_grade_234_ISR <- nrow(df_ae_grade234_ISR)
part_ae_grade_234_ISR <- nrow(distinct(df_ae_grade234_ISR, PID))

# Events per arm
n_ae_grade_234_ISR_group1 <- sum(df_ae_grade234_ISR$group == Arm1, na.rm = TRUE)
n_ae_grade_234_ISR_group2 <- sum(df_ae_grade234_ISR$group == Arm2, na.rm = TRUE)

# Participants per arm
part_ae_grade_234_ISR_group1 <- nrow(distinct(filter(df_ae_grade234_ISR, group == Arm1), PID))
part_ae_grade_234_ISR_group2 <- nrow(distinct(filter(df_ae_grade234_ISR, group == Arm2), PID))

# Number of events per grade category (all arms)
n_ae_grade_234 <- nrow(df_ae_grade234)
n_ae_grade_3 <- nrow(df_ae_grade3)
n_ae_grade_4 <- nrow(df_ae_grade4)

# Number of participants per grade category (all arms)
part_ae_grade_234 <- nrow(distinct(df_ae_grade234, PID))
part_ae_grade_3 <- nrow(distinct(df_ae_grade3, PID))
part_ae_grade_4 <- nrow(distinct(df_ae_grade4, PID))

# Number of events per grade category, per arm
n_ae_grade_234_group1 <- sum(df_ae_grade234$group == Arm1, na.rm = TRUE)
n_ae_grade_3_group1 <- sum(df_ae_grade3$group == Arm1, na.rm = TRUE)
n_ae_grade_4_group1 <- sum(df_ae_grade4$group == Arm1, na.rm = TRUE)
n_ae_grade_234_group2 <- sum(df_ae_grade234$group == Arm2, na.rm = TRUE)
n_ae_grade_3_group2 <- sum(df_ae_grade3$group == Arm2, na.rm = TRUE)
n_ae_grade_4_group2 <- sum(df_ae_grade4$group == Arm2, na.rm = TRUE)

# Number of participants per grade category, per arm
part_ae_grade_234_group1 <- nrow(distinct(filter(df_ae_grade234, group == Arm1), PID))
part_ae_grade_3_group1 <- nrow(distinct(filter(df_ae_grade3, group == Arm1), PID))
part_ae_grade_4_group1 <- nrow(distinct(filter(df_ae_grade4, group == Arm1), PID))
part_ae_grade_234_group2 <- nrow(distinct(filter(df_ae_grade234, group == Arm2), PID))
part_ae_grade_3_group2 <- nrow(distinct(filter(df_ae_grade3, group == Arm2), PID))
part_ae_grade_4_group2 <- nrow(distinct(filter(df_ae_grade4, group == Arm2), PID))


```

```{r automatic input safety km datasets, eval = eval_dsmb, echo = echo_dsmb, message = F}

##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: safety (Kaplan Meier datasets)  #
##########################################

# Kaplan Meier (KM) dataset total hospitalization

# Participants with an event (status = 1): time from vaccination to the first
# event with HS >= 4, in days
hosp_km_signal <- df_full %>%
  filter(PID %in% hosp$PID, event_HS >= 4) %>% # HS >= 4 records of hospitalized PIDs
  select(
    PID, age, gender, BMI, ethnicity, job_category,
    medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd,
    medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years,
    group, date_vaccination, event_start_all, final_date
  ) %>%
  arrange(event_start_all) %>% # oldest event first ...
  distinct(PID, .keep_all = TRUE) %>% # ... so the first event per participant is kept
  mutate(
    time = as.numeric(difftime(
      as.Date(event_start_all, format = '%Y-%m-%d'),
      as.Date(date_vaccination, format = '%Y-%m-%d'),
      units = 'days'
    )),
    status = 1
  )

# Participants without an event (status = 0): time from vaccination to the
# cut-off date or the participant's final date, whichever comes first, in days
hosp_km_nosignal <- df_full %>%
  filter(!(PID %in% hosp$PID)) %>%
  distinct(PID, .keep_all = TRUE) %>% # one record per participant
  select(
    PID, age, gender, BMI, ethnicity, job_category,
    medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd,
    medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years,
    group, date_vaccination, event_start_all, final_date
  ) %>%
  mutate(event_start_all = NA) %>% # no event in this category
  # grouping on PID makes min() work per participant; without it min() would
  # consider final_date over the whole dataset
  group_by(PID) %>%
  mutate(
    time = as.numeric(difftime(
      as.Date(min(date_cut_off, final_date, na.rm = TRUE), format = '%Y-%m-%d'),
      as.Date(date_vaccination, format = '%Y-%m-%d'),
      units = 'days'
    ))
  ) %>%
  ungroup() %>%
  mutate(status = 0)

# Full KM dataset: participants with an event followed by those without
hosp_km <- rbind(hosp_km_signal, hosp_km_nosignal)

#with censoring

hosp_km_censor <- hosp_km %>% 
  left_join(df_PID_censor %>% distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

hosp_km_censor_novac <- hosp_km %>% 
  left_join(df_PID_censor %>% filter(censor %in% c(2, 3)) %>% #only withdrawal and death as censoring
              distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

# KM dataset HS == 0 for RTI events (not all events -> see secondary endpoints, and no ISR modelling)

hs0_PID <- ae_rti_healthy$PID %>% unique()
hs0_PID_eventnumber <- ae_rti_healthy %>% distinct(PID, event_number) %>% ungroup()

hs0_km_signal <- hs0_PID_eventnumber %>% 
  left_join(df_full) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  arrange(event_start_all) %>% #sort with oldest events first, to filter time to first event
  distinct(PID, .keep_all = T) %>% #filter time to first event
  mutate(time = as.numeric(as.Date(event_start_all, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>%
  mutate(status = 1)

hs0_km_nosignal <- df_full %>%
  filter(!(PID %in% hs0_PID)) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  mutate(event_start_all = NA) %>% #no event in this category
  distinct(PID, .keep_all = T) %>% #one record per participant
  group_by(PID) %>% #without grouping on PID, the final_date for the whole dataset is considered in the min() function...
  mutate(time = as.numeric(as.Date(min(date_cut_off, final_date, na.rm=T), format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>% #time from vaccination to cut-off date or final date, whichever comes first
  ungroup() %>% 
  mutate(status = 0)

hs0_km <- rbind(hs0_km_signal, hs0_km_nosignal)

#with censoring

hs0_km_censor <- hs0_km %>% 
  left_join(df_PID_censor %>% distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

hs0_km_censor_novac <- hs0_km %>% 
  left_join(df_PID_censor %>% filter(censor %in% c(2, 3)) %>% #only withdrawal and death as censoring
              distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

# KM dataset HS == 1 for RTI events (not all events -> see secondary endpoints, and no ISR modelling)

hs1_PID <- unique(ae_rti_mild$PID)
hs1_PID_eventnumber <- ae_rti_mild %>% distinct(PID, event_number) %>% ungroup()

hs1_km_signal <- hs1_PID_eventnumber %>% 
  left_join(df_full) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  arrange(event_start_all) %>% #sort with oldest events first, to filter time to first event
  distinct(PID, .keep_all = T) %>% #filter time to first event
  mutate(time = as.numeric(as.Date(event_start_all, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>%
  mutate(status = 1)

hs1_km_nosignal <- df_full %>%
  filter(!(PID %in% hs1_PID)) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  mutate(event_start_all = NA) %>% #no event in this category
  distinct(PID, .keep_all = T) %>% #one record per participant
  group_by(PID) %>% #without grouping on PID, the final_date for the whole dataset is considered in the min() function...
  mutate(time = as.numeric(as.Date(min(date_cut_off, final_date, na.rm=T), format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>% #time from vaccination to cut-off date or final date, whichever comes first
  ungroup() %>% 
  mutate(status = 0)

hs1_km <- rbind(hs1_km_signal, hs1_km_nosignal)

#with censoring

hs1_km_censor <- hs1_km %>% 
  left_join(df_PID_censor %>% distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

hs1_km_censor_novac <- hs1_km %>% 
  left_join(df_PID_censor %>% filter(censor %in% c(2, 3)) %>% #only withdrawal and death as censoring
              distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()


# KM dataset HS == 2 for RTI events (not all events -> see secondary endpoints, and no ISR modelling)

hs2_PID <- unique(ae_rti_moderate$PID)
hs2_PID_eventnumber <- ae_rti_moderate %>% distinct(PID, event_number) %>% ungroup()

hs2_km_signal <- hs2_PID_eventnumber %>% 
  left_join(df_full) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  arrange(event_start_all) %>% #sort with oldest events first, to filter time to first event
  distinct(PID, .keep_all = T) %>% #filter time to first event
  mutate(time = as.numeric(as.Date(event_start_all, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>%
  mutate(status = 1)

hs2_km_nosignal <- df_full %>%
  filter(!(PID %in% hs2_PID)) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  mutate(event_start_all = NA) %>% #no event in this category
  distinct(PID, .keep_all = T) %>% #one record per participant
  group_by(PID) %>% #without grouping on PID, the final_date for the whole dataset is considered in the min() function...
  mutate(time = as.numeric(as.Date(min(date_cut_off, final_date, na.rm=T), format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>% #time from vaccination to cut-off date or final date, whichever comes first
  ungroup() %>% 
  mutate(status = 0)

hs2_km <- rbind(hs2_km_signal, hs2_km_nosignal)

#with censoring

hs2_km_censor <- hs2_km %>% 
  left_join(df_PID_censor %>% distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

hs2_km_censor_novac <- hs2_km %>% 
  left_join(df_PID_censor %>% filter(censor %in% c(2, 3)) %>% #only withdrawal and death as censoring
              distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

# KM dataset HS == 3 for RTI events (not all events -> see secondary endpoints, and no ISR modelling)

hs3_PID <- unique(ae_rti_severe$PID)
hs3_PID_eventnumber <- ae_rti_severe %>% distinct(PID, event_number) %>% ungroup()

hs3_km_signal <- hs3_PID_eventnumber %>% 
  left_join(df_full) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  arrange(event_start_all) %>% #sort with oldest events first, to filter time to first event
  distinct(PID, .keep_all = T) %>% #filter time to first event
  mutate(time = as.numeric(as.Date(event_start_all, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>%
  mutate(status = 1)

hs3_km_nosignal <- df_full %>%
  filter(!(PID %in% hs3_PID)) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  mutate(event_start_all = NA) %>% #no event in this category
  distinct(PID, .keep_all = T) %>% #one record per participant
  group_by(PID) %>% #without grouping on PID, the final_date for the whole dataset is considered in the min() function...
  mutate(time = as.numeric(as.Date(min(date_cut_off, final_date, na.rm=T), format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>% #time from vaccination to cut-off date or final date, whichever comes first
  ungroup() %>% 
  mutate(status = 0)

hs3_km <- rbind(hs3_km_signal, hs3_km_nosignal)

#with censoring

hs3_km_censor <- hs3_km %>% 
  left_join(df_PID_censor %>% distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

hs3_km_censor_novac <- hs3_km %>% 
  left_join(df_PID_censor %>% filter(censor %in% c(2, 3)) %>% #only withdrawal and death as censoring
              distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()


# KM dataset HS == 4 for RTI events (not all events -> see secondary endpoints, and no ISR modelling)

hs4_PID <- unique(ae_rti_hosp$PID)
hs4_PID_eventnumber <- ae_rti_hosp %>% distinct(PID, event_number) %>% ungroup()

hs4_km_signal <- hs4_PID_eventnumber %>% 
  left_join(df_full) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  arrange(event_start_all) %>% #sort with oldest events first, to filter time to first event
  distinct(PID, .keep_all = T) %>% #filter time to first event
  mutate(time = as.numeric(as.Date(event_start_all, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>%
  mutate(status = 1)

hs4_km_nosignal <- df_full %>%
  filter(!(PID %in% hs4_PID)) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  mutate(event_start_all = NA) %>% #no event in this category
  distinct(PID, .keep_all = T) %>% #one record per participant
  group_by(PID) %>% #without grouping on PID, the final_date for the whole dataset is considered in the min() function...
  mutate(time = as.numeric(as.Date(min(date_cut_off, final_date, na.rm=T), format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>% #time from vaccination to cut-off date or final date, whichever comes first
  ungroup() %>% 
  mutate(status = 0)

hs4_km <- rbind(hs4_km_signal, hs4_km_nosignal)

#with censoring

hs4_km_censor <- hs4_km %>% 
  left_join(df_PID_censor %>% distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

hs4_km_censor_novac <- hs4_km %>% 
  left_join(df_PID_censor %>% filter(censor %in% c(2, 3)) %>% #only withdrawal and death as censoring
              distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

# KM dataset HS == 5 for RTI events (not all events -> see secondary endpoints, and no ISR modelling)

hs5_PID <- unique(ae_rti_hosp_oxy$PID)
hs5_PID_eventnumber <- ae_rti_hosp_oxy %>% distinct(PID, event_number) %>% ungroup()

hs5_km_signal <- hs5_PID_eventnumber %>% 
  left_join(df_full) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  arrange(event_start_all) %>% #sort with oldest events first, to filter time to first event
  distinct(PID, .keep_all = T) %>% #filter time to first event
  mutate(time = as.numeric(as.Date(event_start_all, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>%
  mutate(status = 1)

hs5_km_nosignal <- df_full %>%
  filter(!(PID %in% hs5_PID)) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  mutate(event_start_all = NA) %>% #no event in this category
  distinct(PID, .keep_all = T) %>% #one record per participant
  group_by(PID) %>% #without grouping on PID, the final_date for the whole dataset is considered in the min() function...
  mutate(time = as.numeric(as.Date(min(date_cut_off, final_date, na.rm=T), format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>% #time from vaccination to cut-off date or final date, whichever comes first
  ungroup() %>% 
  mutate(status = 0)

hs5_km <- rbind(hs5_km_signal, hs5_km_nosignal)

#with censoring

hs5_km_censor <- hs5_km %>% 
  left_join(df_PID_censor %>% distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

hs5_km_censor_novac <- hs5_km %>% 
  left_join(df_PID_censor %>% filter(censor %in% c(2, 3)) %>% #only withdrawal and death as censoring
              distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

# KM dataset HS == 6 for RTI events (not all events -> see secondary endpoints, and no ISR modelling)

hs6_PID <- unique(ae_rti_hosp_vent$PID)
hs6_PID_eventnumber <- ae_rti_hosp_vent %>% distinct(PID, event_number) %>% ungroup()
 
hs6_km_signal <- hs6_PID_eventnumber %>% 
  left_join(df_full) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  arrange(event_start_all) %>% #sort with oldest events first, to filter time to first event
  distinct(PID, .keep_all = T) %>% #filter time to first event
  mutate(time = as.numeric(as.Date(event_start_all, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>%
  mutate(status = 1)

hs6_km_nosignal <- df_full %>%
  filter(!(PID %in% hs6_PID)) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  mutate(event_start_all = NA) %>% #no event in this category
  distinct(PID, .keep_all = T) %>% #one record per participant
  group_by(PID) %>% #without grouping on PID, the final_date for the whole dataset is considered in the min() function...
  mutate(time = as.numeric(as.Date(min(date_cut_off, final_date, na.rm=T), format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>% #time from vaccination to cut-off date or final date, whichever comes first
  ungroup() %>% 
  mutate(status = 0)

hs6_km <- rbind(hs6_km_signal, hs6_km_nosignal)

#with censoring

hs6_km_censor <- hs6_km %>% 
  left_join(df_PID_censor %>% distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

hs6_km_censor_novac <- hs6_km %>% 
  left_join(df_PID_censor %>% filter(censor %in% c(2, 3)) %>% #only withdrawal and death as censoring
              distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

# KM dataset HS == 7 for RTI events (not all events -> see secondary endpoints, and no ISR modelling)

hs7_PID <- unique(ae_rti_dead$PID)
hs7_PID_eventnumber <- ae_rti_dead %>% distinct(PID, event_number) %>% ungroup()

hs7_km_signal <- hs7_PID_eventnumber %>% 
  left_join(df_full) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  arrange(event_start_all) %>% #sort with oldest events first, to filter time to first event
  distinct(PID, .keep_all = T) %>% #filter time to first event
  mutate(time = as.numeric(as.Date(event_start_all, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>%
  mutate(status = 1)

hs7_km_nosignal <- df_full %>%
  filter(!(PID %in% hs7_PID)) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date) %>%
  mutate(event_start_all = NA) %>% #no event in this category
  distinct(PID, .keep_all = T) %>% #one record per participant
  group_by(PID) %>% #without grouping on PID, the final_date for the whole dataset is considered in the min() function...
  mutate(time = as.numeric(as.Date(min(date_cut_off, final_date, na.rm=T), format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>% #time from vaccination to cut-off date or final date, whichever comes first
  ungroup() %>% 
  mutate(status = 0)

hs7_km <- rbind(hs7_km_signal, hs7_km_nosignal)

#with censoring

hs7_km_censor <- hs7_km %>% 
  left_join(df_PID_censor %>% distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

hs7_km_censor_novac <- hs7_km %>% 
  left_join(df_PID_censor %>% filter(censor %in% c(2, 3)) %>% #only withdrawal and death as censoring
              distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()


# KM dataset HS > 0 for RTI only, first event (signal dataset)

hs_rti_PID <- PID_ae_rti_nozero$PID

hs_rti_km_signal <- df_full %>%
  filter(PID %in% hs_rti_PID) %>%
  filter(event_HS != 0) %>% #remove HS=0 from dataset, only HS>0
  filter(report_date != '') %>% #report_date is the input column from the event_rti datasets
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date, expect_interact, site) %>%
  arrange(event_start_all) %>% #sort with oldest events first, to filter time to first event
  distinct(PID, .keep_all = T) %>% #filter time to first event
  mutate(time = as.numeric(as.Date(event_start_all, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>%
  mutate(status = 1)

hs_rti_km_nosignal <- df_full %>%
  filter(!(PID %in% hs_rti_PID)) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date, expect_interact, site) %>%
  mutate(event_start_all = NA) %>% #no event in this category
  distinct(PID, .keep_all = T) %>% #one record per participant
  group_by(PID) %>% #without grouping on PID, the final_date for the whole dataset is considered in the min() function...
  mutate(time = as.numeric(as.Date(min(date_cut_off, final_date, na.rm=T), format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days')) %>% #time from vaccination to cut-off date or final date, whichever comes first
  ungroup() %>% 
  mutate(status = 0)

hs_rti_km <- rbind(hs_rti_km_signal, hs_rti_km_nosignal)

#with censoring

hs_rti_km_censor <- hs_rti_km %>% 
  left_join(df_PID_censor %>% distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()

hs_rti_km_censor_novac <- hs_rti_km %>% 
  left_join(df_PID_censor %>% filter(censor %in% c(2, 3)) %>% #only withdrawal and death as censoring
              distinct(PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>% 
  mutate(status = ifelse(is.na(status.y), status.x, #if no status.y, keep status.x
                         ifelse(time.x < time.y, status.x, status.y))) %>% #get the status corresponding with the first time point
  ungroup() %>% 
  rowwise() %>% 
  mutate(time = ifelse(is.na(time.y), time.x, #if no time.y, keep time.x
                             min(time.x, time.y, na.rm=T))) %>% #and get the first time point
  ungroup()


```

```{r automatic input C19, eval = eval_dsmb, echo = echo_dsmb, message = F}

##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: COVID-19                        #
##########################################

#COVID19

## A COVID-19 diagnosis (meaning PCR confirmed with symptoms) should be interpreted as a COVID-19 event. This means the LLT = COVID-19 is leading. c19_positive (self reported positive test) can still be used to double check if an event record is created for a self-reported COVID-19, but event is leading. Serology is reported separately.


# 1. self_reported positive test

# Participants with a self-reported positive test (c19_positive == 'Yes').
# distinct() keeps only the first matching record per PID, so each table has
# one row per participant; the n_* objects hold the participant counts.

# all participants
c19_confirm <- distinct(
  filter(df_full, c19_positive == 'Yes'),
  PID, .keep_all = TRUE)
n_c19_confirm <- nrow(c19_confirm)

# arm 1
c19_confirm_group1 <- distinct(
  filter(df_full, group == Arm1, c19_positive == 'Yes'),
  PID, .keep_all = TRUE)
n_c19_confirm_group1 <- nrow(c19_confirm_group1)

# arm 2
c19_confirm_group2 <- distinct(
  filter(df_full, group == Arm2, c19_positive == 'Yes'),
  PID, .keep_all = TRUE)
n_c19_confirm_group2 <- nrow(c19_confirm_group2)

# 2. RTI event form

# COVID-19 events are selected on the coded terms only (LLT text, LLT code and
# the two PT codes for COVID-19 / COVID-19 pneumonia), not on the free-text
# event name, to ensure it is the symptomatic disease and not just a positive
# test result.

# c19_event_no: one row per PID and event_number (a second event_number for the
# same PID represents a re-infection), taken at the highest event_HS recorded
# for that event. Events whose highest HS is 0 are removed afterwards.
c19_event_no <- df_full %>%
  filter(LLT == 'COVID-19' | LLT_code == 10084268 | PT_code == 10084268 | PT_code == 10084380) %>%
  arrange(desc(event_HS)) %>% #highest HS first, so distinct() keeps it
  distinct(PID, event_number, .keep_all = TRUE) %>%
  select(PID, event_number, event_HS) %>%
  filter(event_HS != 0) %>% #remove the HS=0 rows
  ungroup()
n_c19_event <- nrow(c19_event_no)

# c19_event: the full records belonging to the events in c19_event_no, one row
# per PID, event_number, LLT and event_HS.
# NOTE(review): event_HS is part of the join key, so only records at the
# highest HS of each event are retained. If lower-HS (worsening) records of the
# same event should be kept as well, the join key needs to be revisited.
c19_event <- df_full %>%
  filter(LLT == 'COVID-19' | LLT_code == 10084268 | PT_code == 10084268 | PT_code == 10084380,
         event_HS != 0) %>% #prevent asymptomatic infections
  right_join(c19_event_no, by = c('PID', 'event_number', 'event_HS')) %>%
  distinct(PID, event_number, LLT, event_HS, .keep_all = TRUE)

# per arm
c19_event_group1 <- filter(c19_event, group == Arm1)
n_c19_event_group1 <- nrow(c19_event_group1)

c19_event_group2 <- filter(c19_event, group == Arm2)
n_c19_event_group2 <- nrow(c19_event_group2)

# 3. Serology

# Number of participants (distinct PIDs) with a record in visit_week_id_3:
# first for any visit, then per visit week (0, 10, 26, 52) and for visits
# coded -99 (named "unknown" here); each overall and per arm.

# any visit
part_c19_serology <- nrow(distinct(filter(df_full, !is.na(visit_week_id_3)), PID))
part_c19_serology_group1 <- nrow(distinct(filter(df_full, !is.na(visit_week_id_3), group == Arm1), PID))
part_c19_serology_group2 <- nrow(distinct(filter(df_full, !is.na(visit_week_id_3), group == Arm2), PID))

# week 0
part_c19_serology_week0 <- nrow(distinct(filter(df_full, visit_week_id_3 == 0), PID))
part_c19_serology_week0_group1 <- nrow(distinct(filter(df_full, visit_week_id_3 == 0, group == Arm1), PID))
part_c19_serology_week0_group2 <- nrow(distinct(filter(df_full, visit_week_id_3 == 0, group == Arm2), PID))

# week 10
part_c19_serology_week10 <- nrow(distinct(filter(df_full, visit_week_id_3 == 10), PID))
part_c19_serology_week10_group1 <- nrow(distinct(filter(df_full, visit_week_id_3 == 10, group == Arm1), PID))
part_c19_serology_week10_group2 <- nrow(distinct(filter(df_full, visit_week_id_3 == 10, group == Arm2), PID))

# week 26
part_c19_serology_week26 <- nrow(distinct(filter(df_full, visit_week_id_3 == 26), PID))
part_c19_serology_week26_group1 <- nrow(distinct(filter(df_full, visit_week_id_3 == 26, group == Arm1), PID))
part_c19_serology_week26_group2 <- nrow(distinct(filter(df_full, visit_week_id_3 == 26, group == Arm2), PID))

# week 52
part_c19_serology_week52 <- nrow(distinct(filter(df_full, visit_week_id_3 == 52), PID))
part_c19_serology_week52_group1 <- nrow(distinct(filter(df_full, visit_week_id_3 == 52, group == Arm1), PID))
part_c19_serology_week52_group2 <- nrow(distinct(filter(df_full, visit_week_id_3 == 52, group == Arm2), PID))

# visit week coded -99
part_c19_serology_unknown <- nrow(distinct(filter(df_full, visit_week_id_3 == -99), PID))
part_c19_serology_unknown_group1 <- nrow(distinct(filter(df_full, visit_week_id_3 == -99, group == Arm1), PID))
part_c19_serology_unknown_group2 <- nrow(distinct(filter(df_full, visit_week_id_3 == -99, group == Arm2), PID))
# Number of participants with records at more than one distinct visit week
# (0, 10, 26, 52 or the -99 code): each participant/visit-week combination is
# counted once, and participants with more than one combination are kept.
part_c19_secondsample <- df_full %>%
  filter(visit_week_id_3 %in% c(0, 10, 26, 52, -99)) %>%
  distinct(PID, visit_week_id_3) %>%
  count(PID, name = 'samples') %>%
  filter(samples > 1) %>%
  nrow()

# Same count restricted to arm 1
part_c19_secondsample_group1 <- df_full %>%
  filter(visit_week_id_3 %in% c(0, 10, 26, 52, -99), group == Arm1) %>%
  distinct(PID, visit_week_id_3) %>%
  count(PID, name = 'samples') %>%
  filter(samples > 1) %>%
  nrow()

# Same count restricted to arm 2
part_c19_secondsample_group2 <- df_full %>%
  filter(visit_week_id_3 %in% c(0, 10, 26, 52, -99), group == Arm2) %>%
  distinct(PID, visit_week_id_3) %>%
  count(PID, name = 'samples') %>%
  filter(samples > 1) %>%
  nrow()

# Positive serology records (sars_cov_2 == 'POS'), one row per participant and
# visit week; the n_* values count the distinct participants among them.
c19_serology <- df_full %>%
  filter(sars_cov_2 == 'POS') %>%
  distinct(PID, visit_week_id_3, .keep_all = T)
n_c19_serology <- nrow(distinct(c19_serology, PID))

c19_serology_group1 <- df_full %>%
  filter(group == Arm1, sars_cov_2 == 'POS') %>%
  distinct(PID, visit_week_id_3, .keep_all = T)
n_c19_serology_group1 <- nrow(distinct(c19_serology_group1, PID))

c19_serology_group2 <- df_full %>%
  filter(group == Arm2, sars_cov_2 == 'POS') %>%
  distinct(PID, visit_week_id_3, .keep_all = T)
n_c19_serology_group2 <- nrow(distinct(c19_serology_group2, PID))

# Symptomless cases: PIDs with positive serology that do not appear in the
# COVID-19 event records (overall, then per arm).
c19_serology_nosym <- setdiff(c19_serology$PID, c19_event$PID)
n_c19_serology_nosym <- length(unique(c19_serology_nosym))

c19_serology_nosym_group1 <- setdiff(c19_serology_group1$PID, c19_event_group1$PID)
n_c19_serology_nosym_group1 <- length(unique(c19_serology_nosym_group1))

c19_serology_nosym_group2 <- setdiff(c19_serology_group2$PID, c19_event_group2$PID)
n_c19_serology_nosym_group2 <- length(unique(c19_serology_nosym_group2))

## Positive serology at each scheduled timepoint
# For every visit week: the positive records at that week (reduced to the
# columns listed in select()), and the number of distinct participants among
# them, overall and per arm.

# week 0
c19_serology_week0 <- c19_serology %>%
  filter(visit_week_id_3 == 0) %>%
  select(PID, group, date_vaccination, sars_cov_2, visit_week_id_3)
n_c19_serology_week0 <- nrow(distinct(c19_serology_week0, PID))
n_c19_serology_week0_group1 <- nrow(distinct(filter(c19_serology_week0, group == Arm1), PID))
n_c19_serology_week0_group2 <- nrow(distinct(filter(c19_serology_week0, group == Arm2), PID))

# week 10
c19_serology_week10 <- c19_serology %>%
  filter(visit_week_id_3 == 10) %>%
  select(PID, group, date_vaccination, sars_cov_2, visit_week_id_3)
n_c19_serology_week10 <- nrow(distinct(c19_serology_week10, PID))
n_c19_serology_week10_group1 <- nrow(distinct(filter(c19_serology_week10, group == Arm1), PID))
n_c19_serology_week10_group2 <- nrow(distinct(filter(c19_serology_week10, group == Arm2), PID))

# week 26
c19_serology_week26 <- c19_serology %>%
  filter(visit_week_id_3 == 26) %>%
  select(PID, group, date_vaccination, sars_cov_2, visit_week_id_3)
n_c19_serology_week26 <- nrow(distinct(c19_serology_week26, PID))
n_c19_serology_week26_group1 <- nrow(distinct(filter(c19_serology_week26, group == Arm1), PID))
n_c19_serology_week26_group2 <- nrow(distinct(filter(c19_serology_week26, group == Arm2), PID))

# week 52
c19_serology_week52 <- c19_serology %>%
  filter(visit_week_id_3 == 52) %>%
  select(PID, group, date_vaccination, sars_cov_2, visit_week_id_3)
n_c19_serology_week52 <- nrow(distinct(c19_serology_week52, PID))
n_c19_serology_week52_group1 <- nrow(distinct(filter(c19_serology_week52, group == Arm1), PID))
n_c19_serology_week52_group2 <- nrow(distinct(filter(c19_serology_week52, group == Arm2), PID))


# Summary table per visit week: n = participants with positive serology,
# part = participants with a record at that week, rel = n / part.
df_c19_serology <- data.frame(
  TIME = c(0, 10, 26, 52),
  n = c(n_c19_serology_week0, n_c19_serology_week10,
        n_c19_serology_week26, n_c19_serology_week52),
  part = c(part_c19_serology_week0, part_c19_serology_week10,
           part_c19_serology_week26, part_c19_serology_week52)
) %>%
  mutate(rel = n / part)

# Same table split by arm; rows are ordered week by week with arm 1 before
# arm 2, which is the order in which TIME and group are generated below.
df_c19_serology_group <- data.frame(
  TIME = rep(c(0, 10, 26, 52), each = 2),
  group = rep(c(Arm1, Arm2), times = 4),
  n = c(n_c19_serology_week0_group1, n_c19_serology_week0_group2,
        n_c19_serology_week10_group1, n_c19_serology_week10_group2,
        n_c19_serology_week26_group1, n_c19_serology_week26_group2,
        n_c19_serology_week52_group1, n_c19_serology_week52_group2),
  part = c(part_c19_serology_week0_group1, part_c19_serology_week0_group2,
           part_c19_serology_week10_group1, part_c19_serology_week10_group2,
           part_c19_serology_week26_group1, part_c19_serology_week26_group2,
           part_c19_serology_week52_group1, part_c19_serology_week52_group2)
) %>%
  mutate(rel = n / part)
## Positive serology at baseline (visit week 0)
# One row per participant (distinct() keeps only the first record per PID),
# overall and per arm, with the corresponding row counts.

c19_serology_baseline <- distinct(filter(c19_serology, visit_week_id_3 == 0), PID, .keep_all = T)
n_c19_serology_baseline <- nrow(c19_serology_baseline)

c19_serology_baseline_group1 <- distinct(filter(c19_serology_group1, visit_week_id_3 == 0), PID, .keep_all = T)
n_c19_serology_baseline_group1 <- nrow(c19_serology_baseline_group1)

c19_serology_baseline_group2 <- distinct(filter(c19_serology_group2, visit_week_id_3 == 0), PID, .keep_all = T)
n_c19_serology_baseline_group2 <- nrow(c19_serology_baseline_group2)

## Seroconversion
# sars_cov_2 results are POSitive, NEGative, EQUivocal or N_R (non reactive).
# EQU and N_R results are removed first, so a conversion is a NEG result
# directly followed (within a participant, ordered by visit date) by a POS
# result among the remaining records.

# Date of the POS result of each conversion (still grouped by PID)
c19_seroconversion_pos <- df3_full %>%
  select(PID, visit_date_3, sars_cov_2) %>%
  filter(sars_cov_2 != 'EQU', sars_cov_2 != 'N_R') %>%
  arrange(PID, visit_date_3) %>%
  group_by(PID) %>%
  filter(sars_cov_2 == 'POS', lag(sars_cov_2) == 'NEG') %>%
  rename(first_positive = visit_date_3) %>%
  select(!sars_cov_2)

# Date of the NEG result of each conversion (still grouped by PID)
c19_seroconversion_neg <- df3_full %>%
  select(PID, visit_date_3, sars_cov_2) %>%
  filter(sars_cov_2 != 'EQU', sars_cov_2 != 'N_R') %>%
  arrange(PID, visit_date_3) %>%
  group_by(PID) %>%
  filter(sars_cov_2 == 'NEG', lead(sars_cov_2) == 'POS') %>%
  rename(last_negative = visit_date_3) %>%
  select(!sars_cov_2)

# One row per conversion, pairing each negative result with the positive
# result that directly follows it. Joining the two tables above on PID alone
# would combine every last_negative with every first_positive of a participant
# who converts more than once (e.g. NEG, POS, NEG, POS), creating mismatched
# pairs and inflating the row count; taking the next record with lead() keeps
# the pairs aligned. For participants with a single conversion the result is
# the same as the PID-only join.
c19_seroconversion <- df3_full %>%
  select(PID, visit_date_3, sars_cov_2) %>%
  filter(sars_cov_2 != 'EQU', sars_cov_2 != 'N_R') %>%
  arrange(PID, visit_date_3) %>%
  group_by(PID) %>%
  mutate(next_result = lead(sars_cov_2),
         first_positive = lead(visit_date_3)) %>%
  filter(sars_cov_2 == 'NEG', next_result == 'POS') %>%
  ungroup() %>%
  select(PID, last_negative = visit_date_3, first_positive)

# Attach arm and vaccination date to each seroconversion record.
# NOTE(review): an earlier reminder here read "make sure the last negative is
# before the first positive!"; no such check is performed in these lines.
PID_group_vacc <- distinct(df_full, PID, group, date_vaccination)

c19_seroconversion <- left_join(c19_seroconversion, PID_group_vacc, by = 'PID')
n_c19_seroconversion <- nrow(c19_seroconversion)

# Per-arm subsets and their row counts
c19_seroconversion_group1 <- filter(c19_seroconversion, group == Arm1)
n_c19_seroconversion_group1 <- nrow(c19_seroconversion_group1)

c19_seroconversion_group2 <- filter(c19_seroconversion, group == Arm2)
n_c19_seroconversion_group2 <- nrow(c19_seroconversion_group2)

# Seroconversion with or without a C19 event

# With event: seroconverters that have a COVID-19 event starting strictly
# between the last negative and the first positive sample; one row per
# participant (first matching row is kept).
c19_seroconversion_event <- c19_event %>%
  distinct(PID, event_start) %>%
  mutate(c19_event = 1) %>% # flag: participant has a COVID-19 event record
  right_join(c19_seroconversion, by = 'PID') %>%
  filter(c19_event == 1,
         event_start > last_negative,
         event_start < first_positive) %>%
  distinct(PID, .keep_all = T)
n_c19_seroconversion_event <- nrow(c19_seroconversion_event)

c19_seroconversion_event_group1 <- filter(c19_seroconversion_event, group == Arm1)
n_c19_seroconversion_event_group1 <- nrow(c19_seroconversion_event_group1)

c19_seroconversion_event_group2 <- filter(c19_seroconversion_event, group == Arm2)
n_c19_seroconversion_event_group2 <- nrow(c19_seroconversion_event_group2)

# Without event: seroconversion records of participants that have no COVID-19
# event record at all (the flag stays NA after the join).
c19_seroconversion_noevent <- c19_event %>%
  distinct(PID) %>%
  mutate(c19_event = 1) %>%
  right_join(c19_seroconversion, by = 'PID') %>%
  filter(is.na(c19_event))
n_c19_seroconversion_noevent <- nrow(c19_seroconversion_noevent)

c19_seroconversion_noevent_group1 <- filter(c19_seroconversion_noevent, group == Arm1)
n_c19_seroconversion_noevent_group1 <- nrow(c19_seroconversion_noevent_group1)

c19_seroconversion_noevent_group2 <- filter(c19_seroconversion_noevent, group == Arm2)
n_c19_seroconversion_noevent_group2 <- nrow(c19_seroconversion_noevent_group2)

# Seroconversion with an RTI event

# Seroconverters that have an RTI event (from df2_event_rti_PID) starting
# strictly between the last negative and the first positive sample; one row
# per participant (first matching row is kept).
rti_seroconversion_event <- df2_event_rti_PID %>%
  distinct(PID, event_start) %>%
  mutate(rti_event = 1) %>% # flag: participant has an RTI event record
  right_join(c19_seroconversion, by = 'PID') %>%
  filter(rti_event == 1,
         event_start > last_negative,
         event_start < first_positive) %>%
  distinct(PID, .keep_all = T)
n_rti_seroconversion_event <- nrow(rti_seroconversion_event)

rti_seroconversion_event_group1 <- filter(rti_seroconversion_event, group == Arm1)
n_rti_seroconversion_event_group1 <- nrow(rti_seroconversion_event_group1)

rti_seroconversion_event_group2 <- filter(rti_seroconversion_event, group == Arm2)
n_rti_seroconversion_event_group2 <- nrow(rti_seroconversion_event_group2)


# C19 or RTI event for participants seropositive at baseline (week 0)
# Each table holds one row per participant and event start date, so the n_*
# values count event start dates rather than participants.

# COVID-19 events
c19_serobaseline_event <- c19_event %>%
  distinct(PID, event_start) %>%
  mutate(c19_event = 1) %>% # flag: participant has a COVID-19 event record
  right_join(c19_serology_week0, by = 'PID') %>%
  filter(c19_event == 1)
n_c19_serobaseline_event <- nrow(c19_serobaseline_event)

c19_serobaseline_event_group1 <- filter(c19_serobaseline_event, group == Arm1)
n_c19_serobaseline_event_group1 <- nrow(c19_serobaseline_event_group1)

c19_serobaseline_event_group2 <- filter(c19_serobaseline_event, group == Arm2)
n_c19_serobaseline_event_group2 <- nrow(c19_serobaseline_event_group2)

# RTI events
rti_serobaseline_event <- df2_event_rti_PID %>%
  distinct(PID, event_start) %>%
  mutate(rti_event = 1) %>% # flag: participant has an RTI event record
  right_join(c19_serology_week0, by = 'PID') %>%
  filter(rti_event == 1)
n_rti_serobaseline_event <- nrow(rti_serobaseline_event)

rti_serobaseline_event_group1 <- filter(rti_serobaseline_event, group == Arm1)
n_rti_serobaseline_event_group1 <- nrow(rti_serobaseline_event_group1)

rti_serobaseline_event_group2 <- filter(rti_serobaseline_event, group == Arm2)
n_rti_serobaseline_event_group2 <- nrow(rti_serobaseline_event_group2)


## Number of participants with a COVID-19 diagnosis, overall and per arm
# Distinct PIDs among the COVID-19 event records.

c19 <- distinct(c19_event, PID)
part_c19 <- nrow(c19)

c19_group1 <- distinct(c19_event_group1, PID)
part_c19_group1 <- nrow(c19_group1)

c19_group2 <- distinct(c19_event_group2, PID)
part_c19_group2 <- nrow(c19_group2)

# Repeated infection
# Participants with more than one distinct COVID-19 event number (n = number
# of event numbers per participant). Unlike c19_event, these selections do not
# exclude records with event_HS == 0.
repeated_infec <- df_full %>%
  filter(LLT == 'COVID-19' | LLT_code == 10084268 | PT_code == 10084268 | PT_code == 10084380) %>%
  distinct(PID, event_number) %>%
  group_by(PID) %>%
  summarise(n = n()) %>%
  filter(n > 1)
part_repeated_infect <- nrow(repeated_infec)

# arm 1
repeated_infec_group1 <- df_full %>%
  filter(group == Arm1,
         LLT == 'COVID-19' | LLT_code == 10084268 | PT_code == 10084268 | PT_code == 10084380) %>%
  distinct(PID, event_number) %>%
  group_by(PID) %>%
  summarise(n = n()) %>%
  filter(n > 1)
part_repeated_infect_group1 <- nrow(repeated_infec_group1)

# arm 2
repeated_infec_group2 <- df_full %>%
  filter(group == Arm2,
         LLT == 'COVID-19' | LLT_code == 10084268 | PT_code == 10084268 | PT_code == 10084380) %>%
  distinct(PID, event_number) %>%
  group_by(PID) %>%
  summarise(n = n()) %>%
  filter(n > 1)
part_repeated_infect_group2 <- nrow(repeated_infec_group2)

# COVID-19 by health score (HS)
# For each score 1-7: the distinct participants with a COVID-19 event record
# at that score (overall, arm 1, arm 2) and the corresponding counts.

# HS = 1
c19_HS1 <- distinct(filter(c19_event, event_HS == 1), PID)
n_c19_HS1 <- nrow(c19_HS1)
c19_HS1_group1 <- distinct(filter(c19_event_group1, event_HS == 1), PID)
n_c19_HS1_group1 <- nrow(c19_HS1_group1)
c19_HS1_group2 <- distinct(filter(c19_event_group2, event_HS == 1), PID)
n_c19_HS1_group2 <- nrow(c19_HS1_group2)

# HS = 2
c19_HS2 <- distinct(filter(c19_event, event_HS == 2), PID)
n_c19_HS2 <- nrow(c19_HS2)
c19_HS2_group1 <- distinct(filter(c19_event_group1, event_HS == 2), PID)
n_c19_HS2_group1 <- nrow(c19_HS2_group1)
c19_HS2_group2 <- distinct(filter(c19_event_group2, event_HS == 2), PID)
n_c19_HS2_group2 <- nrow(c19_HS2_group2)

# HS = 3
c19_HS3 <- distinct(filter(c19_event, event_HS == 3), PID)
n_c19_HS3 <- nrow(c19_HS3)
c19_HS3_group1 <- distinct(filter(c19_event_group1, event_HS == 3), PID)
n_c19_HS3_group1 <- nrow(c19_HS3_group1)
c19_HS3_group2 <- distinct(filter(c19_event_group2, event_HS == 3), PID)
n_c19_HS3_group2 <- nrow(c19_HS3_group2)

# HS = 4
c19_HS4 <- distinct(filter(c19_event, event_HS == 4), PID)
n_c19_HS4 <- nrow(c19_HS4)
c19_HS4_group1 <- distinct(filter(c19_event_group1, event_HS == 4), PID)
n_c19_HS4_group1 <- nrow(c19_HS4_group1)
c19_HS4_group2 <- distinct(filter(c19_event_group2, event_HS == 4), PID)
n_c19_HS4_group2 <- nrow(c19_HS4_group2)

# HS = 5
c19_HS5 <- distinct(filter(c19_event, event_HS == 5), PID)
n_c19_HS5 <- nrow(c19_HS5)
c19_HS5_group1 <- distinct(filter(c19_event_group1, event_HS == 5), PID)
n_c19_HS5_group1 <- nrow(c19_HS5_group1)
c19_HS5_group2 <- distinct(filter(c19_event_group2, event_HS == 5), PID)
n_c19_HS5_group2 <- nrow(c19_HS5_group2)

# HS = 6
c19_HS6 <- distinct(filter(c19_event, event_HS == 6), PID)
n_c19_HS6 <- nrow(c19_HS6)
c19_HS6_group1 <- distinct(filter(c19_event_group1, event_HS == 6), PID)
n_c19_HS6_group1 <- nrow(c19_HS6_group1)
c19_HS6_group2 <- distinct(filter(c19_event_group2, event_HS == 6), PID)
n_c19_HS6_group2 <- nrow(c19_HS6_group2)

# HS = 7
c19_HS7 <- distinct(filter(c19_event, event_HS == 7), PID)
n_c19_HS7 <- nrow(c19_HS7)
c19_HS7_group1 <- distinct(filter(c19_event_group1, event_HS == 7), PID)
n_c19_HS7_group1 <- nrow(c19_HS7_group1)
c19_HS7_group2 <- distinct(filter(c19_event_group2, event_HS == 7), PID)
n_c19_HS7_group2 <- nrow(c19_HS7_group2)

# Hospitalisation due to COVID-19
# Every distinct participant / serology result combination; a participant
# with several different sars_cov_2 values has several rows here.
PID_sars_cov_2 <- distinct(df_full, PID, sars_cov_2)

# Records in hosp coded as COVID-19 (pneumonia) via LLT / LLT_code / PT_code,
# with the serology result attached; one row per participant (only the first
# record per PID is kept, so the attached sars_cov_2 value is whichever comes
# first after the join).
hosp_c19 <- hosp %>%
  left_join(PID_sars_cov_2, by = 'PID') %>%
  filter(LLT == 'COVID-19' | LLT_code == 10084268 | PT_code == 10084268 | PT_code == 10084380) %>%
  distinct(PID, .keep_all = T)
n_hosp_c19 <- nrow(hosp_c19)

# arm 1
hosp_c19_group1 <- hosp %>%
  left_join(PID_sars_cov_2, by = 'PID') %>%
  filter(group == Arm1,
         LLT == 'COVID-19' | LLT_code == 10084268 | PT_code == 10084268 | PT_code == 10084380) %>%
  distinct(PID, .keep_all = T)
n_hosp_c19_group1 <- nrow(hosp_c19_group1)

# arm 2
hosp_c19_group2 <- hosp %>%
  left_join(PID_sars_cov_2, by = 'PID') %>%
  filter(group == Arm2,
         LLT == 'COVID-19' | LLT_code == 10084268 | PT_code == 10084268 | PT_code == 10084380) %>%
  distinct(PID, .keep_all = T)
n_hosp_c19_group2 <- nrow(hosp_c19_group2)


# KM dataset: hospitalization due to COVID-19
# One row per participant with `time` (days since vaccination) and `status`
# (1 = hospitalised with COVID-19, 0 = no such hospitalisation).

# participant / event-number keys of the COVID-19 hospitalisations
hosp_c19_PID_eventnumber <- hosp_c19 %>% distinct(PID, event_number) %>% ungroup()

# Participants with the event: time from vaccination to the earliest
# event_start_all among the matching df_full records.
hosp_c19_km_signal <- hosp_c19_PID_eventnumber %>%
  left_join(df_full, by = c('PID', 'event_number')) %>%
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date, site, expect_interact) %>%
  arrange(event_start_all) %>% # oldest events first, so distinct() keeps the first event
  distinct(PID, .keep_all = T) %>%
  # `units` is spelled out in full instead of relying on partial matching of `unit`
  mutate(time = as.numeric(as.Date(event_start_all, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), units = 'days')) %>%
  mutate(status = 1)

# Participants without the event: time from vaccination to the cut-off date or
# the participant's final date, whichever comes first.
hosp_c19_km_nosignal <- df_full %>%
  filter(!(PID %in% unique(hosp_c19$PID))) %>%
  distinct(PID, .keep_all = T) %>% # one record per participant
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date, site, expect_interact) %>%
  mutate(event_start_all = NA) %>% # no event in this category
  group_by(PID) %>% # without grouping on PID, min() would consider final_date of the whole dataset
  mutate(time = as.numeric(as.Date(min(date_cut_off, final_date, na.rm=T), format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), units = 'days')) %>%
  ungroup() %>%
  mutate(status = 0)

hosp_c19_km <- rbind(hosp_c19_km_signal, hosp_c19_km_nosignal)

# Hospitalisation KM dataset with censoring
# After the join, .x columns come from hosp_c19_km and .y columns from
# df_PID_censor (first censoring record per participant). Per participant:
#   status - status.x when there is no censoring record or the event time is
#            strictly earlier than the censoring time, otherwise status.y
#            (so on equal times status.y is used)
#   time   - time.x when there is no censoring time, otherwise the smaller of
#            the two times
hosp_c19_km_censor <- hosp_c19_km %>%
  left_join(distinct(df_PID_censor, PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>%
  mutate(
    status = ifelse(is.na(status.y), status.x,
                    ifelse(time.x < time.y, status.x, status.y)),
    time = ifelse(is.na(time.y), time.x,
                  min(time.x, time.y, na.rm=T))
  ) %>%
  ungroup()

# Same, but censoring on withdrawal and death only (censor codes 2 and 3), not
# on vaccination (following the SAP procedure)
hosp_c19_km_censor_SAP <- hosp_c19_km %>%
  left_join(distinct(filter(df_PID_censor, censor %in% c(2, 3)), PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>%
  mutate(
    status = ifelse(is.na(status.y), status.x,
                    ifelse(time.x < time.y, status.x, status.y)),
    time = ifelse(is.na(time.y), time.x,
                  min(time.x, time.y, na.rm=T))
  ) %>%
  ungroup()

# Clarify naming
hosp_c19_km_itt <- hosp_c19_km_censor_SAP # censoring at withdrawn or death
hosp_c19_km_pp <- hosp_c19_km_censor # censoring at withdrawn, death, COVID-19 vaccination

# KM dataset: COVID-19
# One row per participant with `time` (days since vaccination) and `status`
# (1 = COVID-19 event, 0 = no COVID-19 event).

# participant / event-number keys of the COVID-19 events
c19_PID_eventnumber <- c19_event %>% distinct(PID, event_number) %>% ungroup()

# Participants with the event: time from vaccination to the earliest
# event_start_all among the matching df_full records with HS > 0.
c19_km_signal <- c19_PID_eventnumber %>%
  left_join(df_full, by = c('PID', 'event_number')) %>%
  filter(event_HS > 0) %>% # ignore HS=0
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date, event_HS) %>%
  arrange(event_start_all) %>% # oldest events first, so distinct() keeps the first event
  distinct(PID, .keep_all = T) %>%
  # `units` is spelled out in full instead of relying on partial matching of `unit`
  mutate(time = as.numeric(as.Date(event_start_all, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), units = 'days')) %>%
  mutate(status = 1)

# Participants without the event: time from vaccination to the cut-off date or
# the participant's final date, whichever comes first.
c19_km_nosignal <- df_full %>%
  filter(!(PID %in% unique(c19_PID_eventnumber$PID))) %>%
  distinct(PID, .keep_all = T) %>% # one record per participant
  select(PID, age, gender, BMI, ethnicity, job_category, medhis_dm, medhis_hyptens, medhis_asthma, medhis_cvd, medhis_copd, medhis_otherlung, medhis_cvd_type, medhis_kd, bcg_scar, pack_years, group, date_vaccination, event_start_all, final_date, event_HS) %>%
  mutate(event_start_all = NA) %>% # no event in this category
  group_by(PID) %>% # without grouping on PID, min() would consider final_date of the whole dataset
  mutate(time = as.numeric(as.Date(min(date_cut_off, final_date, na.rm=T), format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), units = 'days')) %>%
  ungroup() %>%
  mutate(status = 0) %>%
  mutate(event_HS = NA)

c19_km <- rbind(c19_km_signal, c19_km_nosignal)

# COVID-19 KM dataset with censoring
# After the join, .x columns come from c19_km and .y columns from
# df_PID_censor (first censoring record per participant). Per participant:
#   status - status.x when there is no censoring record or the event time is
#            strictly earlier than the censoring time, otherwise status.y
#            (so on equal times status.y is used)
#   time   - time.x when there is no censoring time, otherwise the smaller of
#            the two times
c19_km_censor <- c19_km %>%
  left_join(distinct(df_PID_censor, PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>%
  mutate(
    status = ifelse(is.na(status.y), status.x,
                    ifelse(time.x < time.y, status.x, status.y)),
    time = ifelse(is.na(time.y), time.x,
                  min(time.x, time.y, na.rm=T))
  ) %>%
  ungroup()

# Same, with only withdrawal and death (censor codes 2 and 3) as censoring
c19_km_censor_novac <- c19_km %>%
  left_join(distinct(filter(df_PID_censor, censor %in% c(2, 3)), PID, .keep_all = T), by = c('PID')) %>%
  rowwise() %>%
  mutate(
    status = ifelse(is.na(status.y), status.x,
                    ifelse(time.x < time.y, status.x, status.y)),
    time = ifelse(is.na(time.y), time.x,
                  min(time.x, time.y, na.rm=T))
  ) %>%
  ungroup()

```

```{r automatic input TB, eval = eval_dsmb, echo = echo_dsmb, message = F}

##########################################
# AUTOMATIC INPUT OF VARIABLES TO REPORT #
# Focus: Tuberculosis                    #
##########################################

# TB by IGRA

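## TB (IGRA) tests are performed at week 0 and week 52.
## If a test is taken, the result is one of 'Positive', 'Intermediate', 'Negative', or 'No result' (with a comment);
## these are coded in igra as 'POS', 'IND', 'NEG' and 'N_R' respectively ('IND' presumably stands for indeterminate - to be confirmed).
## If igra is blank (''), no test was taken.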

# Baseline (visit week 0): number of participants (distinct PIDs) per IGRA
# result, overall and per arm

# positive
n_TB_baseline_pos <- nrow(distinct(filter(df_full, visit_week_id_3 == 0, igra == 'POS'), PID))
n_TB_baseline_pos_group1 <- nrow(distinct(filter(df_full, group == Arm1, visit_week_id_3 == 0, igra == 'POS'), PID))
n_TB_baseline_pos_group2 <- nrow(distinct(filter(df_full, group == Arm2, visit_week_id_3 == 0, igra == 'POS'), PID))

# intermediate (coded 'IND')
n_TB_baseline_int <- nrow(distinct(filter(df_full, visit_week_id_3 == 0, igra == 'IND'), PID))
n_TB_baseline_int_group1 <- nrow(distinct(filter(df_full, group == Arm1, visit_week_id_3 == 0, igra == 'IND'), PID))
n_TB_baseline_int_group2 <- nrow(distinct(filter(df_full, group == Arm2, visit_week_id_3 == 0, igra == 'IND'), PID))

# negative
n_TB_baseline_neg <- nrow(distinct(filter(df_full, visit_week_id_3 == 0, igra == 'NEG'), PID))
n_TB_baseline_neg_group1 <- nrow(distinct(filter(df_full, group == Arm1, visit_week_id_3 == 0, igra == 'NEG'), PID))
n_TB_baseline_neg_group2 <- nrow(distinct(filter(df_full, group == Arm2, visit_week_id_3 == 0, igra == 'NEG'), PID))

# no result
n_TB_baseline_nr <- nrow(distinct(filter(df_full, visit_week_id_3 == 0, igra == 'N_R'), PID))
n_TB_baseline_nr_group1 <- nrow(distinct(filter(df_full, group == Arm1, visit_week_id_3 == 0, igra == 'N_R'), PID))
n_TB_baseline_nr_group2 <- nrow(distinct(filter(df_full, group == Arm2, visit_week_id_3 == 0, igra == 'N_R'), PID))

# Week 52: number of participants (distinct PIDs) per IGRA result, overall and
# per arm

# positive
n_TB_week52_pos <- nrow(distinct(filter(df_full, visit_week_id_3 == 52, igra == 'POS'), PID))
n_TB_week52_pos_group1 <- nrow(distinct(filter(df_full, group == Arm1, visit_week_id_3 == 52, igra == 'POS'), PID))
n_TB_week52_pos_group2 <- nrow(distinct(filter(df_full, group == Arm2, visit_week_id_3 == 52, igra == 'POS'), PID))

# intermediate (coded 'IND')
n_TB_week52_int <- nrow(distinct(filter(df_full, visit_week_id_3 == 52, igra == 'IND'), PID))
n_TB_week52_int_group1 <- nrow(distinct(filter(df_full, group == Arm1, visit_week_id_3 == 52, igra == 'IND'), PID))
n_TB_week52_int_group2 <- nrow(distinct(filter(df_full, group == Arm2, visit_week_id_3 == 52, igra == 'IND'), PID))

# negative
n_TB_week52_neg <- nrow(distinct(filter(df_full, visit_week_id_3 == 52, igra == 'NEG'), PID))
n_TB_week52_neg_group1 <- nrow(distinct(filter(df_full, group == Arm1, visit_week_id_3 == 52, igra == 'NEG'), PID))
n_TB_week52_neg_group2 <- nrow(distinct(filter(df_full, group == Arm2, visit_week_id_3 == 52, igra == 'NEG'), PID))

# no result
n_TB_week52_nr <- nrow(distinct(filter(df_full, visit_week_id_3 == 52, igra == 'N_R'), PID))
n_TB_week52_nr_group1 <- nrow(distinct(filter(df_full, group == Arm1, visit_week_id_3 == 52, igra == 'N_R'), PID))
n_TB_week52_nr_group2 <- nrow(distinct(filter(df_full, group == Arm2, visit_week_id_3 == 52, igra == 'N_R'), PID))

# IGRA results at visits with unknown week ----
# Same counts as for the scheduled visits, restricted to records whose visit
# week is coded -99. Each value is the number of distinct participants (PID)
# with the given IGRA code, overall and per arm (Arm1 / Arm2).

# 'POS' (positive)
n_TB_unknown_pos <- df_full %>%
  filter(visit_week_id_3 == -99, igra == 'POS') %>%
  distinct(PID) %>%
  nrow()
n_TB_unknown_pos_group1 <- df_full %>%
  filter(group == Arm1, visit_week_id_3 == -99, igra == 'POS') %>%
  distinct(PID) %>%
  nrow()
n_TB_unknown_pos_group2 <- df_full %>%
  filter(group == Arm2, visit_week_id_3 == -99, igra == 'POS') %>%
  distinct(PID) %>%
  nrow()

# 'IND'
n_TB_unknown_int <- df_full %>%
  filter(visit_week_id_3 == -99, igra == 'IND') %>%
  distinct(PID) %>%
  nrow()
n_TB_unknown_int_group1 <- df_full %>%
  filter(group == Arm1, visit_week_id_3 == -99, igra == 'IND') %>%
  distinct(PID) %>%
  nrow()
n_TB_unknown_int_group2 <- df_full %>%
  filter(group == Arm2, visit_week_id_3 == -99, igra == 'IND') %>%
  distinct(PID) %>%
  nrow()

# 'NEG' (negative)
n_TB_unknown_neg <- df_full %>%
  filter(visit_week_id_3 == -99, igra == 'NEG') %>%
  distinct(PID) %>%
  nrow()
n_TB_unknown_neg_group1 <- df_full %>%
  filter(group == Arm1, visit_week_id_3 == -99, igra == 'NEG') %>%
  distinct(PID) %>%
  nrow()
n_TB_unknown_neg_group2 <- df_full %>%
  filter(group == Arm2, visit_week_id_3 == -99, igra == 'NEG') %>%
  distinct(PID) %>%
  nrow()

# 'N_R' (no result)
n_TB_unknown_nr <- df_full %>%
  filter(visit_week_id_3 == -99, igra == 'N_R') %>%
  distinct(PID) %>%
  nrow()
n_TB_unknown_nr_group1 <- df_full %>%
  filter(group == Arm1, visit_week_id_3 == -99, igra == 'N_R') %>%
  distinct(PID) %>%
  nrow()
n_TB_unknown_nr_group2 <- df_full %>%
  filter(group == Arm2, visit_week_id_3 == -99, igra == 'N_R') %>%
  distinct(PID) %>%
  nrow()


# Unique values of igra.nr_comment recorded on rows with an 'N_R' (no result)
# IGRA code, overall and per arm. Each object is a one-column data frame.
comment_TB_nr <- df_full %>%
  filter(igra == 'N_R') %>%
  distinct(igra.nr_comment)
comment_TB_nr_group1 <- df_full %>%
  filter(group == Arm1, igra == 'N_R') %>%
  distinct(igra.nr_comment)
comment_TB_nr_group2 <- df_full %>%
  filter(group == Arm2, igra == 'N_R') %>%
  distinct(igra.nr_comment)

# Participants with any recorded (non-missing) IGRA value at week 0,
# overall and per arm; each PID is counted once.
part_TB_week0 <- df_full %>%
  filter(visit_week_id_3 == 0, !is.na(igra)) %>%
  distinct(PID) %>%
  nrow()
part_TB_week0_group1 <- df_full %>%
  filter(visit_week_id_3 == 0, !is.na(igra), group == Arm1) %>%
  distinct(PID) %>%
  nrow()
part_TB_week0_group2 <- df_full %>%
  filter(visit_week_id_3 == 0, !is.na(igra), group == Arm2) %>%
  distinct(PID) %>%
  nrow()

# Same participant counts for week 52.
part_TB_week52 <- df_full %>%
  filter(visit_week_id_3 == 52, !is.na(igra)) %>%
  distinct(PID) %>%
  nrow()
part_TB_week52_group1 <- df_full %>%
  filter(visit_week_id_3 == 52, !is.na(igra), group == Arm1) %>%
  distinct(PID) %>%
  nrow()
part_TB_week52_group2 <- df_full %>%
  filter(visit_week_id_3 == 52, !is.na(igra), group == Arm2) %>%
  distinct(PID) %>%
  nrow()

# Participants with any recorded (non-missing) IGRA value at a visit whose week
# is coded -99 (unknown), overall and per arm.
# The overall count de-duplicates on PID only, exactly like the per-arm counts
# below and the week 0 / week 52 participant counts. De-duplicating on
# (PID, igra) would count a participant once per distinct result, so the
# overall figure could exceed the sum of the two arms.
part_TB_unknown <- df_full %>%
  filter(visit_week_id_3 == -99, !is.na(igra)) %>%
  distinct(PID) %>%
  nrow()
part_TB_unknown_group1 <- df_full %>%
  filter(visit_week_id_3 == -99, !is.na(igra)) %>%
  filter(group == Arm1) %>%
  distinct(PID) %>%
  nrow()
part_TB_unknown_group2 <- df_full %>%
  filter(visit_week_id_3 == -99, !is.na(igra)) %>%
  filter(group == Arm2) %>%
  distinct(PID) %>%
  nrow()


# Rows of df_TB_trial that have no match in df_TB_baseline. No `by` is given,
# so matching uses every column name the two tables have in common.
df_TB_seroconversion <- df_TB_trial %>%
  anti_join(df_TB_baseline)

# Distinct participant / visit-date pairs from df_TB, leaving out records
# coded as week 52; the visit date is exposed under the name TBdate.
df_TB_trial_PID <- distinct(
  filter(df_TB, visit_week_id_3 != 52),
  PID, TBdate = visit_date_3
)

# Active TB ----

# Regular expression used to recognise tuberculosis in the event name.
# NOTE(review): the 'tb' / 'TB' alternatives match as plain substrings, so an
# event name that merely contains those letters (e.g. "heartburn") is also
# selected -- confirm the rows in active_tb before relying on the counts.
spelling_TB <- 'Tuberculosis|tuberculosis|TUBERCULOSIS|tb|TB'

# Event rows flagged as tuberculosis either by the coded term (LLT) or by the
# event name matching spelling_TB, reduced to unique combinations of the
# listed columns.
active_tb <- df_full %>%
  filter(LLT == 'Tuberculosis' | grepl(spelling_TB, event_name)) %>%
  distinct(PID, event_name, event_number, group, LLT, event_HS)

# n_*: number of distinct (participant, event number) pairs;
# part_*: number of distinct participants.
n_active_TB <- nrow(distinct(active_tb, PID, event_number))
part_active_TB <- nrow(distinct(active_tb, PID))

# Same counts per arm.
n_active_TB_group1 <- nrow(
  distinct(filter(active_tb, group == Arm1), PID, event_number)
)
n_active_TB_group2 <- nrow(
  distinct(filter(active_tb, group == Arm2), PID, event_number)
)
part_active_TB_group1 <- nrow(
  distinct(filter(active_tb, group == Arm1), PID)
)
part_active_TB_group2 <- nrow(
  distinct(filter(active_tb, group == Arm2), PID)
)

# IGRA converters ----
# Participants whose IGRA result changes from 'NEG' to 'POS' between two
# consecutive retained tests.
#
# Both tables below start from the same preparation of df3_full:
#   * rows coded 'EQU', 'N_R', 'IND' or '' are removed; because the comparisons
#     use `!=`, rows with a missing (NA) igra are removed as well;
#   * rows are ordered by participant and visit date and grouped by PID, so
#     lag()/lead() look at the previous/next retained test of the same
#     participant only.

# Date of each 'POS' test that directly follows a 'NEG' test (first_positive).
igraconversion_pos <- df3_full %>% 
  select(PID, visit_date_3, igra) %>% 
  filter(igra != 'EQU', igra != 'N_R', igra != '', igra != 'IND') %>% 
  arrange(PID, visit_date_3) %>% 
  group_by(PID) %>% 
  filter(igra == 'POS', lag(igra) == 'NEG') %>% 
  rename(first_positive = visit_date_3) %>% 
  select(!igra)

# Date of each 'NEG' test that is directly followed by a 'POS' test
# (last_negative).
igraconversion_neg <- df3_full %>% 
  select(PID, visit_date_3, igra) %>% 
  filter(igra != 'EQU', igra != 'N_R', igra != '', igra != 'IND') %>% 
  arrange(PID, visit_date_3) %>% 
  group_by(PID) %>% 
  filter(igra == 'NEG', lead(igra) == 'POS') %>% 
  rename(last_negative = visit_date_3) %>% 
  select(!igra)

# Combine both dates per participant and attach the columns of df_risk.
# Neither join specifies `by`, so they match on all shared column names.
# NOTE(review): a participant with more than one NEG -> POS transition would
# get every last_negative paired with every first_positive -- confirm this
# cannot occur in the data.
igraconversion <- igraconversion_neg %>% 
  full_join(igraconversion_pos) %>% 
  ungroup() %>% 
  left_join(df_risk)

# IGRA reverters ----
# Participants whose IGRA result changes from 'POS' to 'NEG' between two
# consecutive retained tests. Preparation mirrors the converter tables: rows
# coded 'EQU', 'N_R', 'IND' or '' (and, through `!=`, rows with NA igra) are
# dropped, then rows are ordered by visit date within each PID so that
# lag()/lead() refer to the previous/next retained test of that participant.
# visit_week_id_3 is kept here so the week-52 variant below can filter on it.

# Date of each 'POS' test that is directly followed by a 'NEG' test
# (last_positive).
revere_igraconversion_pos <- df3_full %>% 
  select(PID, visit_date_3, igra, visit_week_id_3) %>% 
  filter(igra != 'EQU', igra != 'N_R', igra != '', igra != 'IND') %>% 
  arrange(PID, visit_date_3) %>% 
  group_by(PID) %>% 
  filter(igra == 'POS', lead(igra) == 'NEG') %>% 
  rename(last_positive = visit_date_3) %>% 
  select(!igra)

# Date of each 'NEG' test that directly follows a 'POS' test (first_negative).
revere_igraconversion_neg <- df3_full %>% 
  select(PID, visit_date_3, igra, visit_week_id_3) %>% 
  filter(igra != 'EQU', igra != 'N_R', igra != '', igra != 'IND') %>% 
  arrange(PID, visit_date_3) %>% 
  group_by(PID) %>% 
  filter(igra == 'NEG', lag(igra) == 'POS') %>% 
  rename(first_negative = visit_date_3) %>% 
  select(!igra)

# All reversions: both dates per participant plus the columns of df_risk.
# visit_week_id_3 is removed from both sides before joining so that it is not
# used as a join key (no `by` is given; joins match on shared column names).
revere_igraconversion <- revere_igraconversion_neg %>% select(!visit_week_id_3) %>% 
  full_join(revere_igraconversion_pos %>% select(!visit_week_id_3) ) %>% 
  ungroup() %>% 
  left_join(df_risk)

# Reversions where neither test was taken at a visit coded -99.
# NOTE(review): the original comments describe the remaining visits as week 0
# and week 52; that holds only if no other week codes occur in the data.
revere_igraconversion_week52 <- revere_igraconversion_neg %>% 
  # keep only tests from visits with a known week (not -99)
  filter(visit_week_id_3 != -99) %>%
  # remove the week column to prevent double records after the join
  select(!visit_week_id_3) %>%
  # same restriction on the positive side before joining
  full_join(revere_igraconversion_pos %>% filter(visit_week_id_3 != -99) %>% select(!visit_week_id_3)) %>%
  ungroup() %>% 
  left_join(df_risk) %>% 
  # full joining the filtered tables leaves NA in first_negative or
  # last_positive when the partner test was at a -99 visit; drop those rows
  filter(!is.na(first_negative )) %>% 
  filter(!is.na(last_positive )) 


```


\newpage  

# Introduction to trial and reported data

### Report status

`r ifelse(CLOSED == TRUE, "This is the closed report with observations per arm, *please close if you are not permitted to read this.*", "This is the open report without observations per arm.")`

### Protocol title

Reducing morbidity and mortality in healthcare and other frontline workers at risk of exposure to SARS-CoV-2 by enhancing non-specific immune responses through Bacillus Calmette-Gu$\'{e}$rin vaccination, a double-blinded, randomized controlled trial.

### Protocol number

TASK-008-BCG-CORONA

### Protocol version

4.0 (date 20 May 2020)
Incorporating amendments 1, 2 and 3

### ClinicalTrials.gov Identifier

[NCT04379336](https://clinicaltrials.gov/ct2/show/study/NCT04379336)

### Principal investigator
 
Dr. Caryn Upton

### Meeting date

`r format(as.Date(date), '%A, %d %B %Y')`

### Data report issued

`r format(Sys.time(), '%A, %d %B %Y')`

### Data cut off date

`r format(as.Date(date_cut_off), '%A, %d %B %Y')`

### Date of last data safety monitoring board (DSMB) meeting

`r format(as.Date(date_last_dsmb), '%A, %d %B %Y')`

### Prepared by

`r author_prep`

\newpage

# Executive summary

| | | 
|--- | --------- |
|**Report overview** | This report is the final trial report with data available in the study database as of `r format(as.Date(date_cut_off), '%A, %d %B %Y')`. Summary tables are provided in the body of the report. Additional tables and figures referenced in the report are provided in the Appendices.|
|||
|**Enrolment status** | `r part_screen` participants were screened for this study.|
| | `r part_enrol_noQC ` participants were enrolled.|
| | `r part_enrol` participants were enrolled of which data (quality controlled, QC'ed) is in this report.|
| | `r ifelse(CLOSED == T, paste(part_group1, " participants were assigned to the ", Arm1, " arm, ", part_group2, " participants were assigned to the ", Arm2, " arm."),"")`|
|||
|**Participant status** | `r ifelse(CLOSED == T, paste(part_followup_10_group1, "participants in the ", Arm1, " arm, and", part_followup_10_group2, "participants in the ", Arm2, " arm have completed week 10 serology follow-up."), paste(part_followup_10, "participants have completed week 10 serology follow-up."))` |
| | `r ifelse(CLOSED == T, paste(part_followup_26_group1, "participants in the ", Arm1, " arm, and", part_followup_26_group2, "participants in the ", Arm2, " arm have completed week 26 serology follow-up."), paste(part_followup_26, "participants have completed week 26 serology follow-up."))`|
| | `r ifelse(CLOSED == T, paste(part_followup_52_group1, "participants in the ", Arm1, " arm, and", part_followup_52_group2, "participants in the ", Arm2, " arm have completed week 52 serology follow-up."), paste(part_followup_52, "participants have completed week 52 serology follow-up."))`|
| | `r ifelse(CLOSED == T, paste(part_discont, " participants have been discontinued (withdrawn), ", part_discont_group1, " in the ", Arm1, " arm and ", part_discont_group2, " in the ", Arm2, " arm.", sep = ""), paste(part_discont, " participants have been discontinued (withdrawn).", sep = ""))`|
| | `r ifelse(CLOSED == T, paste("Of those, ", part_discont_losttofu, " participants were lost to follow-up, ", part_discont_losttofu_group1, " in the ", Arm1, " arm and ", part_discont_losttofu_group2, " in the ", Arm2, " arm, and ", part_discont - part_discont_losttofu, " participants lost interest, ", part_discont_group1 - part_discont_losttofu_group1, " in the ", Arm1, " arm and ", part_discont_group2 - part_discont_losttofu_group2, " in the ", Arm2, " arm.", sep = ""), paste("Of those, ", part_discont_losttofu, " participants were lost to follow-up, and ", part_discont - part_discont_losttofu, " participants lost interest.", sep = ""))`|
|||
|**Demographics and risk factors**  | `r ifelse(CLOSED == T, paste("Median age (interquartile range [IQR]) was ", sum_age_med_group1, " (", sum_age_lwr_group1, "-", sum_age_upr_group1, ") years in the ", Arm1, " arm, and ", sum_age_med_group2, " (", sum_age_lwr_group2, "-", sum_age_upr_group2, ") years in the ", Arm2, " arm.", sep = ''), paste("Median age (interquartile range [IQR]) was ", sum_age_med, " (", sum_age_lwr, "-", sum_age_upr, ") years.", sep = ''))`|
||`r ifelse(CLOSED == T, paste("Median BMI (IQR) was ", sum_BMI_med_group1, " (", sum_BMI_lwr_group1, "-", sum_BMI_upr_group1, ") kg/m$^2$ in the ", Arm1, " arm, and ", sum_BMI_med_group2, " (", sum_BMI_lwr_group2, "-", sum_BMI_upr_group2, ") kg/m$^2$ in the ", Arm2, " arm.", sep = ''), paste("Median BMI (IQR) was ", sum_BMI_med, " (", sum_BMI_lwr, "-", sum_BMI_upr, ") kg/m$^2$.", sep = ''))`|
||`r ifelse(CLOSED == T, paste("Gender distribution was ", nrow(df_risk[df_risk$gender == 'Male' & df_risk$group == Arm1,]), " males and ", nrow(df_risk[df_risk$gender == 'Female' & df_risk$group == Arm1,]), " females in the ", Arm1, " arm, and ", nrow(df_risk[df_risk$gender == 'Male' & df_risk$group == Arm2,]), " males and ", nrow(df_risk[df_risk$gender == 'Female' & df_risk$group == Arm2,]), " females in the ", Arm2, " arm.", sep = ''), paste("Gender distribution was ", nrow(df_risk[df_risk$gender == 'Male',]), " males and ", nrow(df_risk[df_risk$gender == 'Female',]), " females.", sep = ''))`|
||`r ifelse(CLOSED == T, paste("Ethnicity distribution was ", nrow(df_risk[df_risk$ethnicity == 'African' & df_risk$group == Arm1,]), " African, ", nrow(df_risk[df_risk$ethnicity == 'Caucasian' & df_risk$group == Arm1,]), " Caucasian, ", nrow(df_risk[df_risk$ethnicity == 'Coloured' & df_risk$group == Arm1,]), " Coloured, ", nrow(df_risk[df_risk$ethnicity == 'Indian' & df_risk$group == Arm1,]), " Indian, and ", nrow(df_risk[df_risk$ethnicity == 'Other' & df_risk$group == Arm1,]), " Other in the ", Arm1, " arm, and ", nrow(df_risk[df_risk$ethnicity == 'African' & df_risk$group == Arm2,]), " African, ", nrow(df_risk[df_risk$ethnicity == 'Caucasian' & df_risk$group == Arm2,]), " Caucasian, ", nrow(df_risk[df_risk$ethnicity == 'Coloured' & df_risk$group == Arm2,]), " Coloured, ", nrow(df_risk[df_risk$ethnicity == 'Indian' & df_risk$group == Arm2,]), " Indian, and ", nrow(df_risk[df_risk$ethnicity == 'Other' & df_risk$group == Arm2,]), " Other in the ", Arm2, " arm.", sep = ''), paste("Ethnicity distribution was ", nrow(df_risk[df_risk$ethnicity == 'African',]), " African, ", nrow(df_risk[df_risk$ethnicity == 'Caucasian',]), " Caucasian, ", nrow(df_risk[df_risk$ethnicity == 'Coloured',]), " Coloured, ", nrow(df_risk[df_risk$ethnicity == 'Indian',]), " Indian, and ", nrow(df_risk[df_risk$ethnicity == 'Other',]), " Other.", sep = ''))`|
||`r ifelse(CLOSED == T, paste("Job category distribution was ", nrow(df_risk[df_risk$job_category == 'Doctor' & df_risk$group == Arm1,]), " Doctors, ", nrow(df_risk[df_risk$job_category == 'Nurse' & df_risk$group == Arm1,]), " Nurses, and ", nrow(df_risk[df_risk$job_category == 'Essential_workers' & df_risk$group == Arm1,]) , " Essential workers in the ", Arm1, " arm, and ", nrow(df_risk[df_risk$job_category == 'Doctor' & df_risk$group == Arm2,]), " Doctors, ", nrow(df_risk[df_risk$job_category == 'Nurse' & df_risk$group == Arm2,]), " Nurses, and ", nrow(df_risk[df_risk$job_category == 'Essential_workers' & df_risk$group == Arm2,]) , " Essential workers in the ", Arm2, " arm.", sep = ''), paste("Job category distribution was ", nrow(df_risk[df_risk$job_category == 'Doctor',]), " Doctors, ", nrow(df_risk[df_risk$job_category == 'Nurse',]), " Nurses, and ", nrow(df_risk[df_risk$job_category == 'Essential_workers',]) , " Essential workers.", sep = ''))`|
||`r ifelse(CLOSED == T, paste("There were ", nrow(df_risk[!is.na(df_risk$pack_years) & df_risk$pack_years != 0 & df_risk$group == Arm1,]), " (", round(100*nrow(df_risk[!is.na(df_risk$pack_years) & df_risk$pack_years != 0 & df_risk$group == Arm1,])/part_group1),"%) current smokers (mean pack years: ", round(mean(df_risk$pack_years[!is.na(df_risk$pack_years) & df_risk$pack_years != 0 & df_risk$group == Arm1]), 1),", IQR: ", sum_smoke_lwr_group1, "-", sum_smoke_upr_group1, ") in the ", Arm1, " arm, and ", nrow(df_risk[!is.na(df_risk$pack_years) & df_risk$pack_years != 0 & df_risk$group == Arm2,]), " (", round(100*nrow(df_risk[!is.na(df_risk$pack_years) & df_risk$pack_years != 0 & df_risk$group == Arm2,])/part_group2),"%) current smokers (mean pack years: ", round(mean(df_risk$pack_years[!is.na(df_risk$pack_years) & df_risk$pack_years != 0 & df_risk$group == Arm2]), 1),", IQR: ", sum_smoke_lwr_group2, "-", sum_smoke_upr_group2, ") in the ", Arm2, " arm.", sep = ''), paste("There were ", nrow(df_risk[!is.na(df_risk$pack_years) & df_risk$pack_years != 0,]), " (", round(100*nrow(df_risk[!is.na(df_risk$pack_years) & df_risk$pack_years != 0,])/part_enrol),"%) current smokers (mean pack years: ", round(mean(df_risk$pack_years[!is.na(df_risk$pack_years) & df_risk$pack_years != 0]), 1),", IQR: ", sum_smoke_lwr, "-", sum_smoke_upr, ").", sep = ''))`|
|||
| | *Prevalence of medical conditions* |
| | Diabetes mellitus:|
||`r ifelse(CLOSED == T, paste(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_dm" & df_med_risk_sum$group == Arm1][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_dm" & df_med_risk_sum$group == Arm1][[1]] / part_group1, 1), "%) in the ", Arm1, " arm, ", df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_dm" & df_med_risk_sum$group == Arm2][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_dm" & df_med_risk_sum$group == Arm2][[1]] / part_group2, 1), "%) in the ", Arm2, " arm", sep = ''), paste(sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_dm"]), " (", round(100 *  sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_dm"]) / part_enrol, 1), "%)", sep = ''))`|
| | Hypertension:|
||`r ifelse(CLOSED == T, paste(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_hyptens" & df_med_risk_sum$group == Arm1][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_hyptens" & df_med_risk_sum$group == Arm1][[1]] / part_group1, 1), "%) in the ", Arm1, " arm, ", df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_hyptens" & df_med_risk_sum$group == Arm2][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_hyptens" & df_med_risk_sum$group == Arm2][[1]] / part_group2, 1), "%) in the ", Arm2, " arm", sep = ''), paste(sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_hyptens"]), " (", round(100 *  sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_hyptens"]) / part_enrol, 1), "%)", sep = ''))`|
| | Asthma:|
||`r ifelse(CLOSED == T, paste(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_asthma" & df_med_risk_sum$group == Arm1][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_asthma" & df_med_risk_sum$group == Arm1][[1]] / part_group1, 1), "%) in the ", Arm1, " arm, ", df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_asthma" & df_med_risk_sum$group == Arm2][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_asthma" & df_med_risk_sum$group == Arm2][[1]] / part_group2, 1), "%) in the ", Arm2, " arm", sep = ''), paste(sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_asthma"]), " (", round(100 *  sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_asthma"]) / part_enrol, 1), "%)", sep = ''))`|
| | Chronic obstructive pulmonary disease (COPD):|
||`r ifelse(CLOSED == T, paste(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_copd" & df_med_risk_sum$group == Arm1][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_copd" & df_med_risk_sum$group == Arm1][[1]] / part_group1, 1), "%) in the ", Arm1, " arm, ", df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_copd" & df_med_risk_sum$group == Arm2][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_copd" & df_med_risk_sum$group == Arm2][[1]] / part_group2, 1), "%) in the ", Arm2, " arm", sep = ''), paste(sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_copd"]), " (", round(100 *  sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_copd"]) / part_enrol, 1), "%)", sep = ''))`|
| | Other lung diseases:|
||`r ifelse(CLOSED == T, paste(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_otherlung" & df_med_risk_sum$group == Arm1][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_otherlung" & df_med_risk_sum$group == Arm1][[1]] / part_group1, 1), "%) in the ", Arm1, " arm, ", df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_otherlung" & df_med_risk_sum$group == Arm2][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_otherlung" & df_med_risk_sum$group == Arm2][[1]] / part_group2, 1), "%) in the ", Arm2, " arm", sep = ''), paste(sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_otherlung"]), " (", round(100 *  sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_otherlung"]) / part_enrol, 1), "%)", sep = ''))`|
| | Cardiovascular disease:|
||`r ifelse(CLOSED == T, paste(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_cvd" & df_med_risk_sum$group == Arm1][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_cvd" & df_med_risk_sum$group == Arm1][[1]] / part_group1, 1), "%) in the ", Arm1, " arm, ", df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_cvd" & df_med_risk_sum$group == Arm2][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_cvd" & df_med_risk_sum$group == Arm2][[1]] / part_group2, 1), "%) in the ", Arm2, " arm", sep = ''), paste(sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_cvd"]), " (", round(100 *  sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_cvd"]) / part_enrol, 1), "%)", sep = ''))`|
| | Kidney disease:|
||`r ifelse(CLOSED == T, paste(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_kd" & df_med_risk_sum$group == Arm1][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_kd" & df_med_risk_sum$group == Arm1][[1]] / part_group1, 1), "%) in the ", Arm1, " arm, ", df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_kd" & df_med_risk_sum$group == Arm2][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_kd" & df_med_risk_sum$group == Arm2][[1]] / part_group2, 1), "%) in the ", Arm2, " arm", sep = ''), paste(sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_kd"]), " (", round(100 *  sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "medhis_kd"]) / part_enrol, 1), "%)", sep = ''))`|
| | Bacillus Calmette-Gu$\'e$rin (BCG) scar:|
||`r ifelse(CLOSED == T, paste(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "bcg_scar" & df_med_risk_sum$group == Arm1][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "bcg_scar" & df_med_risk_sum$group == Arm1][[1]] / part_group1, 1), "%) in the ", Arm1, " arm, ", df_med_risk_sum$prevalence[df_med_risk_sum$risk == "bcg_scar" & df_med_risk_sum$group == Arm2][[1]], " (", round(100 *  df_med_risk_sum$prevalence[df_med_risk_sum$risk == "bcg_scar" & df_med_risk_sum$group == Arm2][[1]] / part_group2, 1), "%) in the ", Arm2, " arm", sep = ''), paste(sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "bcg_scar"]), " (", round(100 *  sum(df_med_risk_sum$prevalence[df_med_risk_sum$risk == "bcg_scar"]) / part_enrol, 1), "%)", sep = ''))`|
|||
|**Primary endpoint** | `r ifelse(CLOSED == T, paste(n_hosp_c19_group1, " (", round(100 * n_hosp_c19_group1 / part_group1, 1), "%) participants have been hospitalized with a positive COVID-19 PCR test result in the ", Arm1, " arm.", sep = ''), paste(n_hosp_c19, " (", round(100 * n_hosp_c19 / part_enrol, 1), "%) participants have been hospitalized with a positive COVID-19 PCR test result.", sep = ''))` |
|| `r ifelse(CLOSED == T, paste(n_hosp_c19_group2, " (", round(100 * n_hosp_c19_group2 / part_group2, 1), "%) participants have been hospitalized with a positive COVID-19 PCR test result in the ", Arm2, " arm.", sep = ''), '')` |
|||
|| `r ifelse(CLOSED == T, paste("A Cox proportional hazard model was utilized to assess the statistical significance of the treatment arm on the primary endpoint. The hazard ratio was ", round(summary(coxph(formula = Surv(time, status) ~ group, data = hosp_c19_km_censor_SAP))$coefficients[2],2), " (", round(exp(confint(coxph(formula = Surv(time, status) ~ group, data = hosp_c19_km_censor_SAP))[1]), 2), "-", round(exp(confint(coxph(formula = Surv(time, status) ~ group, data = hosp_c19_km_censor_SAP))[2]), 2) , " 95% confidence interval) for the ", Arm2, " arm relative to the ", Arm1, " arm (p-value = ", round(summary(coxph(formula = Surv(time, status) ~ group, data = hosp_c19_km_censor_SAP))$coefficients[5], 3),").", sep = ""), "")` |
|| The primary endpoint is here reported based on the intention-to-treat population with right censoring only at withdrawal of consent or lost to follow-up, death, or the end of the trial. |
|||
|**Coronavirus disease 2019 (COVID-19)** | `r ifelse(CLOSED == T, paste("In total, ", part_c19_group1, " (", round(100 * part_c19_group1 / part_group1, 1), "%) participants had a COVID-19 diagnosis in the ", Arm1, " arm. In total, ", part_c19_group2, " (", round(100 * part_c19_group2 / part_group2, 1), "%) participants had a COVID-19 diagnosis in the ", Arm2, " arm.", sep = ''), paste("In total, ", part_c19, " (", round(100 * part_c19 / part_enrol, 1), "%) participants had a COVID-19 diagnosis.", sep = ''))` |
||*COVID-19 was diagnosed as respiratory tract infection symptoms with a positive PCR-test.*|
||  |
|**Efficacy summary** | `r ifelse(CLOSED == T, paste(n_ae_rti_group1, " respiratory tract infection events occurred in ", part_ae_rti_group1, " (", round(100 * part_ae_rti_group1 / part_group1, 1), "%) participants in the ", Arm1, " arm, and ", n_ae_rti_group2, " respiratory tract infection events occurred in ", part_ae_rti_group2, " (", round(100 * part_ae_rti_group2 / part_group2, 1), "%) participants in the ", Arm2, " arm.", sep = ""), paste(n_ae_rti, " respiratory tract infection events occurred in ", part_ae_rti, " (", round(100 * part_ae_rti / part_enrol, 1), "%) participants.", sep = ""))` |
|| |
|| `r ifelse(CLOSED == T, paste(n_hosp_group1, " (", round(100 * n_hosp_group1 / part_group1, 1), "%) participants were hospitalized in the ", Arm1, " arm, and ", n_hosp_group2, " (", round(100 * n_hosp_group2 / part_group2, 1), "%) participants were hospitalized in the ", Arm2, " arm.", sep = ''), paste(n_hosp, " (", round(100 * n_hosp / part_enrol, 1), "%) participants were hospitalized.", sep = ''))`  |
||  |
|| [Health status per individual participant is shown over time in this data report][Health status over time for respiratory tract infections]. |
||   |
|**Safety summary** | `r n_ae` total adverse events (including respiratory tract infections, see above) occurred in `r part_ae` participants.|
| | `r n_ae_isr` injection site reaction (ISR) adverse events occurred in `r part_ae_isr` (`r round(100*part_ae_isr/part_enrol, 1)`%) participants.|
| | `r ifelse(CLOSED == T, paste(n_ae_other_group1, "other adverse events occurred in", part_ae_other_group1, "participants in the", Arm1, "arm, and",n_ae_other_group2, "other adverse events occurred in", part_ae_other_group2, "participants in the", Arm2, "arm."), paste(n_ae_other, "other adverse events occurred in", part_ae_other, "participants"))` |
|||
|| `r ifelse(CLOSED == T, paste("Of the ", n_ae_rti, " respiratory tract infection events, ", n_ae_rti_mild, " (", round(100 * n_ae_rti_mild / n_ae_rti), "%, ", ae_rti_mild %>% filter(group == Arm1) %>%  nrow()," in the ", Arm1, " arm and ", ae_rti_mild %>% filter(group == Arm2) %>%  nrow()," in the ", Arm2, " arm) were considered mild and ", n_ae_rti_moderate, " (", round(100 * n_ae_rti_moderate / n_ae_rti), "%, ", ae_rti_moderate %>% filter(group == Arm1) %>%  nrow()," in the ", Arm1, " arm and ", ae_rti_moderate %>% filter(group == Arm2) %>%  nrow()," in the ", Arm2, " arm) were considered moderate.", sep = ''), paste("Of the ", n_ae_rti, " respiratory tract infection events, ", n_ae_rti_mild, " (", round(100 * n_ae_rti_mild / n_ae_rti), "%) were considered mild and ", n_ae_rti_moderate, " (", round(100 * n_ae_rti_moderate / n_ae_rti), "%) were considered moderate.", sep = ''))` |
|| Of the `r n_ae_isr` injection site reaction adverse events, `r n_ae_isr_mild` (`r round(100 * n_ae_isr_mild / n_ae_isr)`\%) were considered mild and `r n_ae_isr_moderate` (`r round(100 * n_ae_isr_moderate / n_ae_isr)`\%) were considered moderate. |
|| `r ifelse(CLOSED == T, paste("Of the ", n_ae_other, " other adverse events, ", n_ae_other_mild, " (", round(100 * n_ae_other_mild / n_ae_other), "%, ", ae_other_mild %>% filter(group == Arm1) %>%  nrow()," in the ", Arm1, " arm and ", ae_other_mild %>% filter(group == Arm2) %>%  nrow()," in the ", Arm2, " arm) were considered mild and ", n_ae_other_moderate, " (", round(100 * n_ae_other_moderate / n_ae_other), "%, ", ae_other_moderate %>% filter(group == Arm1) %>%  nrow()," in the ", Arm1, " arm and ", ae_other_moderate %>% filter(group == Arm2) %>%  nrow()," in the ", Arm2, " arm) were considered moderate.", sep = ''),  paste("Of the ", n_ae_other, " other adverse events, ", n_ae_other_mild, " (", round(100 * n_ae_other_mild / n_ae_other), "%) were considered mild and ", n_ae_other_moderate, " (", round(100 * n_ae_other_moderate / n_ae_other), "%) were considered moderate.", sep = ''))` |
|||
| | `r n_sae_previous` serious adverse events were reported in the previous DSMB report.|
| | `r ifelse(n_sae - n_sae_previous == 0, "There have been no additional serious adverse events since the last DSMB meeting.", ifelse(n_sae - n_sae_previous == 1, paste(n_sae - n_sae_previous, "additional serious adverse event was reported since the last DSMB meeting."), paste(n_sae - n_sae_previous, "additional serious adverse events were reported since the last DSMB meeting.")))` |
|| `r ifelse(CLOSED == T, paste("In total,", n_sae_group1, "serious adverse events were reported in", part_sae_group1, "participants in the ", Arm1, " arm, and ", n_sae_group2, "serious adverse events were reported in", part_sae_group2, "participants in the ", Arm2, " arm."), paste("In total,", n_sae, "serious adverse events were reported in", part_sae, "participants."))` |
|| SAEs were `r paste(c('unrelated', 'unlikely related', 'possibly related', 'probably related', 'definitely related')[c(n_sae_unrelated, n_sae_unlikelyrelated, n_sae_possiblyrelated, n_sae_probablyrelated, n_sae_definitely) > 0], collapse = ', or ')` to the intervention. |
|||
|**Other COVID-19 vaccinations** | `r ifelse(CLOSED == T, paste(part_vaccine_C19_group1, " (", round(100 * part_vaccine_C19_group1 / part_group1, 1), "%) participants recorded a SARS-CoV-2 specific vaccination since the start of the trial in the ", Arm1, " arm.", sep = ''), paste(part_vaccine_C19, " (", round(100 * part_vaccine_C19 / part_enrol, 1), "%) participants recorded a SARS-CoV-2 specific vaccination since the start of the trial.", sep = ''))` | 
|| `r ifelse(CLOSED == T, paste(part_vaccine_C19_group2, " (", round(100 * part_vaccine_C19_group2 / part_group2, 1), "%) participants recorded a SARS-CoV-2 specific vaccination since the start of the trial in the ", Arm2, " arm.", sep = ''), "")` | 
|||
|**Protocol deviations** | `r n_deviations` protocol deviations associated with `r part_deviations` participants have been reported. |
| | `r ifelse(n_deviations_safety == 0, "None of the deviations impacted participant safety.", paste(n_deviations_safety, "of the deviations impacted participant safety."))`|
| | `r ifelse(n_deviations_integrity == 0, "None of the deviations impacted scientific integrity.", paste(n_deviations_integrity, "of the deviations impacted scientific integrity."))`|
|||
|**Quality management data** | Quality management reviews were performed by TASK and were last completed on `r format(as.Date(date_last_review), '%A, %d %B %Y')`.|
|| In `r n_qc_yes` of the reviewed data entries (`r round(100* n_qc_yes/n_qc)`\%), a correction was needed. |
|||
|**Quality control data analysis** | Quality control review of the data analysis scripts was last completed on `r format(as.Date(date_last_review_data_analysis), '%A, %d %B %Y')`, with no major findings.|

\newpage

# Glossary

**$\alpha$**
: Significance level for statistical analysis

**$\beta$**
: Probability of type II error ($1-\beta = power$) 

AE
: Adverse event

BCG 
: Bacillus Calmette-Gu$\'e$rin (vaccine)

BMI 
: Body mass index

Censored observations
: The event of interest is not observed within the study because, for example, the participant drops out or the study ends before the event of interest occurs

COPD 
: Chronic obstructive pulmonary disease

COVID-19 
: Coronavirus disease 2019

Cox proportional hazards model
: Statistical model for survival analysis (see: survival analysis) of for example hospitalization rate, with the objective to link survival to risk factors (both continuous and categorical covariates)

DSMB 
: Data safety monitoring board

eCRF 
: Electronic case report form 

FU 
: Follow-up on for example recorded events

Hazard rate
: Probability of the event of interest happening

HR
: Hazard ratio, quantifying the ratio of the hazard of an event in one arm compared to another (for example between intervention arms, or between levels of a risk factor). An $HR > 1$ reflects an increased probability of the event and a decreased survival, and an $HR < 1$ reflects a decreased probability of the event and an increased survival.

HCW 
: Health care worker

HIV 
: Human immunodeficiency virus

HS 
: Health status used to characterize seriousness of events from healthy (0) to fatal (7)

ICF 
: Informed consent form

IgG 
: Immunoglobulin G, the antibody used to test for previous SARS-CoV-2 infections

IGRA
: Interferon-gamma release assay, used to test for tuberculosis infections

IP 
: Investigational product

IQR 
: Interquartile range (25th-75th percentile) to capture variability in a variable

ISR 
: Injection site reaction

Kaplan-Meier plot
: Figure to visualize survival (proportion) as function of time and variables (for example intervention arm, risk factor)

Markov model
: Modelling approach of states (for example health status scores) where the future state only depends on the current state 

MedDRA 
: Medical dictionary for regulatory activities

NA 
: Not available

NEC 
: Not elsewhere classified

PPE 
: Personal protective equipment

QC 
: Quality control of for example the data analysis script

RTI 
: Respiratory tract infection

SAE 
: Serious adverse event

SAHPRA 
: South African Health Products Regulatory Authority

SARS-CoV-2 
: Severe acute respiratory syndrome coronavirus 2

Survival analysis
: Analysis of the duration of time before an event happens

TB 
: Tuberculosis (original indication for BCG vaccine)


\newpage

# Protocol summary

| | | 
|--- | --------- |
|**Rationale** | A novel betacoronavirus, SARS-CoV-2, is spreading rapidly throughout the world. A large epidemic in South Africa may overwhelm available hospital capacity and healthcare resources which would be worsened by absenteeism of healthcare workers and other frontline staff (HCW). Strategies to prevent morbidity and mortality of HCW are desperately needed to safeguard continuous patient care. Bacillus Calmette-Gu$\'e$rin (BCG) is a vaccine against tuberculosis (TB), with protective non-specific effects against other respiratory tract infections in *in vitro* and *in vivo* studies, with reported morbidity and mortality reductions as high as 70\%.|
|||
|| Due to the novel nature of COVID-19, the impact of genetics on infection, disease severity and outcomes is not well understood. A series of exploratory sub-studies on blood and saliva-samples from a subset of participants will be performed focusing on epigenetic changes and immunogenomics across groups. |
|||
|**Hypothesis** | BCG vaccination may reduce the morbidity and mortality of HCW during the SARS-CoV-2 outbreak in South Africa.|
|||
|**Study design** | A double-blinded, randomised controlled trial to compare the efficacy of BCG vaccination versus placebo for reducing HCW morbidity and mortality during 52 weeks of follow-up among HCW with risk of exposure to SARS-CoV-2 infected patients.|
|||
|**Duration** | 52 weeks of follow-up after enrolment.|
|||
|**Intervention** | Participants are randomized to administration of BCG vaccine or placebo in a 1:1 ratio.|
|||
|**Study population** | A minimum of 500 HCW, defined as nurses, medical doctors and support personnel and other frontline staff (essential workers), working in a facility or at a service where they are likely to have contact with SARS-CoV-2 infected patients. For simplicity "HCW" is used for all these participants.|
|||
|**Risk and benefits** | Based on previous experience and randomized controlled trials in adult and elderly individuals, the risks of BCG vaccination are considered low. The objective of this trial is to evaluate the beneficial effects of BCG vaccination through a lower hospital admission of HCW and/or a mitigated clinical course of COVID-19. |
|||

\newpage

# Report overview

`r ifelse(CLOSED == TRUE, "**This is the closed report with observations per arm, please close if you are not permitted to read this.**", "**This is the open report without observations per arm.**")`

The purpose of this report is to review cumulative enrolment, primary endpoint, efficacy data, and safety data for the "BCG re-vaccination for healthcare workers in SARS-CoV-2 pandemic" trial. It reflects data from the database as of `r date_last_review`.

`r ifelse(n_dsmb == 1, paste('There has been ', n_dsmb, ' completed DSMB meeting for this study and the last meeting was held on ', date_last_dsmb, '.', sep = ''), paste('There have been ', n_dsmb, ' completed DSMB meetings for this study and the last meeting was held on ', date_last_dsmb, '.', sep = ''))`

Readers of this report are asked to maintain the confidentiality of the information provided in this report.

\newpage

# Response to most recent DSMB requests

1. List of response to DSMB requests

\newpage

# Enrolment status

`r part_screen` participants were screened for this study.

`r part_enrol_noQC` participants were enrolled.

`r part_enrol` participants were enrolled and data underwent quality control and were included in this report.

The trial is at 100\% and the database has been locked.

<!-- `r ifelse(CLOSED == T, paste('Trial progress is at ', round(100 * 2 * df_max_cum_partweek$max_cum_partweek[df_max_cum_partweek$group == Arm1], 1), '% cumulative participant-weeks in the ', Arm1, ' arm, and ', round(100 * 2 * df_max_cum_partweek$max_cum_partweek[df_max_cum_partweek$group == Arm2], 1), '% cumulative participant-weeks in the ', Arm2, ' arm.', sep = ''), paste('Trial progress is at ', round(100 * df_max_cum_partweek$max_cum_partweek, 1), '% cumulative participant-weeks.', sep = ''))` -->

<!-- The following enrolment graph shows the cumulative number of enrolled participants over time`r ifelse(CLOSED == T, " stratified per arm", "")`. -->

```{r enrollment graph, eval = F, echo = F}
# Line plot of the number of participants in the trial over calendar time,
# one panel per arm: cumulative enrolments (counted on date_vaccination) are
# joined with cumulative trial exits (counted on final_date).
# NOTE: this chunk is switched off through its chunk option (eval = F).
#
# if(CLOSED == T){
#   placeholder = quo(group) #create placeholder for closed report, to stratify on group
# } else {
#   placeholder = quo(trunc(group-1.5)) #use trunc function to round both -0.5 and 0.5 to zero (1-1.5 = -0.5, 2-1.5 = 0.5)--> both arms become 0
# } 

df_full %>%
  distinct(PID, .keep_all = TRUE) %>% # remove double records per ID
  group_by(date_vaccination, group) %>% # group to count enrolment per date and group
  summarise(enrolled = n()) %>%
  group_by(group) %>% # group to count cumulative enrolment per group
  mutate(cum_enrolled = cumsum(enrolled),
         Date = as.Date(date_vaccination)) %>%
  full_join(df_full %>% 
      arrange(final_date) %>% # sort so that distinct() below keeps a non-NA final date when one exists
      distinct(PID, .keep_all = TRUE) %>% 
      group_by(final_date, group) %>% 
      summarise(final = n()) %>% 
      group_by(group) %>% 
      arrange(final_date) %>% 
      mutate(cum_intrial0 = cumsum(final), # cumulative number of participants with this or an earlier final date
             Date = final_date), by = c('Date', 'group')) %>% 
  arrange(Date) %>% 
  # enrolment-only dates carry the cumulative enrolment; dates with a final-date record are filled from above
  mutate(enrol0 = ifelse(is.na(final), cum_enrolled, NA)) %>% 
  fill(enrol0, .direction = 'down') %>% 
  # on dates without an enrolment record, subtract the cumulative exits from the previous enrolment level
  mutate(enrol = ifelse(is.na(enrolled), lag(enrol0) - cum_intrial0, enrol0)) %>% 
ggplot(aes(Date, enrol, col = factor(group))) +
  geom_line() +
  # guide = "none" replaces the deprecated guide = F (legend is redundant with the facets)
  scale_colour_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = "none", option = 'cividis') +
  scale_x_date(name = 'Enrolment date') +
  scale_y_continuous(name = 'Cumulative number of enrolled participants') +
  theme_bw() +
  facet_grid(~group, labeller = labeller(group = c('Both'='Total trial population', '1' = '1', '2' = '2')))

```

### Screening

`r df_full %>% filter(meet_crit == 'Yes') %>% distinct(PID) %>% nrow()` participants met all inclusion criteria and no exclusion criteria.

### Inclusion criteria

`r nrow(df_incl_crit[df_incl_crit$incl1 == 'Yes',])` participants met inclusion criterion 1 (>18 years). 

`r nrow(df_incl_crit[df_incl_crit$incl2 == 'Yes',])` participants met inclusion criterion 2 (health care worker). 

`r nrow(df_incl_crit[df_incl_crit$incl3 == 'Yes',])` participants met inclusion criterion 3 (informed consent). 

`r nrow(df_incl_crit[df_incl_crit$incl4 == 'Yes',])` participants met inclusion criterion 4 (mobile phone for follow-up). 

### Exclusion criteria

`r nrow(df_excl_crit[df_excl_crit$excl1 == 'Yes',])` participants met exclusion criterion 1 (BCG vaccine allergy). 

`r nrow(df_excl_crit[df_excl_crit$excl2 == 'Yes',])` participants met exclusion criterion 2 (active TB). 

`r nrow(df_excl_crit[df_excl_crit$excl3 == 'Yes',])` participants met exclusion criterion 3 (HIV-1 positive). 

`r nrow(df_excl_crit[df_excl_crit$excl4 == 'Yes',])` participants met exclusion criterion 4 (respiratory tract infection).

`r nrow(df_excl_crit[df_excl_crit$excl5 == 'Yes',])` participants met exclusion criterion 5 (immunocompromised).  

`r nrow(df_excl_crit[df_excl_crit$excl6 == 'Yes',])` participants met exclusion criterion 6 (pregnancy).

`r nrow(df_excl_crit[df_excl_crit$excl7 == 'Yes',])` participants met exclusion criterion 7 (excluded medication). 

`r nrow(df_excl_crit[df_excl_crit$excl8 == 'Yes',])` participants met exclusion criterion 8 (experimental anti-SARS-CoV-2). 

### Randomization

`r ifelse(CLOSED == T, paste(part_group1, " participants were randomly assigned to the ", Arm1, " arm, ", part_group2, " participants were randomly assigned to the ", Arm2, " arm.", sep = ''), "The participants were randomly assigned to arm 1 or 2 in a 1:1 ratio.")`

<!-- Participants in arm 1 received `r Arm1`.
Participants in arm 2 received `r Arm2`. -->

<!-- This report is blinded regarding the intervention of arm 1 and 2. -->

### Follow-up

The following table reports the number of participants for whom each follow-up visit has been recorded.

| Follow-up visit | Number of participants |
| ------ | ------: |
| 1st | `r part_followup_visit_1` |
| 2nd | `r part_followup_visit_2` |
| 3rd | `r part_followup_visit_3` |
| 4th | `r part_followup_visit_4` |
| 5th | `r part_followup_visit_5` |
| 6th | `r part_followup_visit_6` |
| 7th | `r part_followup_visit_7` |
| 8th | `r part_followup_visit_8` |
| 9th | `r part_followup_visit_9` |
| 10th | `r part_followup_visit_10` |
| 11th | `r part_followup_visit_11` |
| 12th | `r part_followup_visit_12` |
| 13th | `r part_followup_visit_13` |
| 14th | `r part_followup_visit_14` |
| 15th | `r part_followup_visit_15` |
| 16th | `r part_followup_visit_16` |
| 17th | `r part_followup_visit_17` |

\newpage

# Sample size

As per the protocol, at least 500 participants were included in the study. 

The primary endpoint was proportion of hospitalization per arm. For a two-sample test (1 test, 1 control) for proportions, a power calculation based on the Pearson chi-square test with continuity correction (for smaller sample sizes) was performed. Here, a sample size calculation for two sample populations with $\alpha = 0.05$ and $\beta = 0.2$ (i.e. 80\% power) was performed. This calculation did not take into account a drop-out rate.

Power calculation resulted in a sample size of 220 participants per arm assuming an attack rate in health care workers of 30%, hospitalisation rate of 20% and a reduction by vaccination of 75%. These assumptions were based on [Liu et al (2020)](https://doi.org/10.1016/S0140-6736(20)30462-1), the case severity reporting from Wuhan, and the reduction in respiratory infections by BCG previously reported by [Wardhana et al (2011)](https://pubmed.ncbi.nlm.nih.gov/21979284/) and [Nemes et al (2018)](https://doi.org/10.1056/nejmoa1714021), respectively.

The full detailed power calculations supporting the sample size can be found in [Appendix A][Appendix A: Power calculation].

\newpage

# Participant status

<!-- Study data is blinded`r ifelse(CLOSED == T, " and arms are referred to as 1 (first) or 2 (second) ", "")`. -->
Participants in arm 1 received `r Arm1`.
Participants in arm 2 received `r Arm2`.

This report is based on the intention-to-treat population with right censoring only at withdrawal of consent, loss to follow-up, death, or the end of the trial.

## Demographics

The following figures display the baseline demographics of the participants`r ifelse(CLOSED == T, " stratified per arm", "")`. The solid line represents the smooth density.

### Age
```{r demographics age, eval = eval_dsmb, echo = echo_dsmb, fig.height = 3, message = F}
# Density-scaled histogram of baseline age with an overlaid smooth density,
# one panel per level of `group`. In the closed report the title also shows
# the Mann-Whitney-Wilcoxon p-value comparing age between the groups.
ggplot(df_demographics_numerical, aes(age)) + 
  geom_histogram(aes(y = ..density.., fill = factor(group)), alpha = 0.5) +
  geom_density() + 
  # guide = "none" replaces the deprecated guide = F (fill is redundant with the facets)
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = "none", option = 'cividis') + 
  scale_x_continuous(name = 'Age (year)') + 
  scale_y_continuous(name = 'Density') + 
  ggtitle(paste0(
    'Distribution of age in the trial population',
    # the test is only run (and reported) for the closed, per-arm report
    if (CLOSED == TRUE) {
      paste0(" stratified per arm\np-value (Mann-Whitney-Wilcoxon) = ",
             round(wilcox.test(age ~ group, data = df_demographics_numerical)$p.value, 3))
    } else {
      ""
    }
  )) + 
  theme_bw() + 
  # facet_grid(~group, labeller = labeller(group = c('Both'='Total trial population', '1' = Arm1, '2' = Arm2)))
  facet_grid(~group)
```

<!--### Body weight
```{r demographics weight, eval = eval_dsmb, echo = echo_dsmb, fig.height = 3, message = F, include = F}
# Density-scaled histogram of body weight with an overlaid smooth density,
# one panel per level of `group`. The chunk sits inside an HTML comment and
# has include = F, so its output is not shown in the report.
ggplot(df_demographics_numerical, aes(weight)) + 
  geom_histogram(aes(y = ..density.., fill = factor(group)), alpha = 0.5) +
  geom_density() + 
  # guide = "none" replaces the deprecated guide = F
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = "none", option = 'cividis') + 
  scale_x_continuous(name = 'Body weight (kg)') + 
  scale_y_continuous(name = 'Density') + 
  ggtitle(paste0('Distribution of body weight in the trial population', if (CLOSED == TRUE) " stratified per arm" else "")) + 
  theme_bw() + 
  facet_grid(~group, labeller = labeller(group = c('Both'='Total trial population', '1' = '1', '2' = '2')))
```

### Height
```{r demographics height, eval = eval_dsmb, echo = echo_dsmb, fig.height = 3, message = F, include = F}
# Density-scaled histogram of height with an overlaid smooth density, one
# panel per level of `group`. The chunk sits inside an HTML comment and has
# include = F, so its output is not shown in the report.
df_demographics_numerical %>%
  # filter() drops rows where the comparison is NA; base `[` indexing would
  # instead return all-NA rows for them (and hence an NA facet).
  # NOTE(review): 999 is presumably the placeholder code for a missing height - confirm.
  filter(height != 999) %>%
ggplot(aes(height)) + 
  geom_histogram(aes(y = ..density.., fill = factor(group)), alpha = 0.5) +
  geom_density() + 
  # guide = "none" replaces the deprecated guide = F
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = "none", option = 'cividis') + 
  scale_x_continuous(name = 'Height (cm)') + 
  scale_y_continuous(name = 'Density') + 
  ggtitle(paste0('Distribution of height in the trial population', if (CLOSED == TRUE) " stratified per arm" else "")) + 
  theme_bw() + 
  facet_grid(~group, labeller = labeller(group = c('Both'='Total trial population', '1' = '1', '2' = '2')))
```
-->

### Body mass index (BMI)
```{r demographics BMI, eval = eval_dsmb, echo = echo_dsmb, fig.height = 3, message = F}
# Density-scaled histogram of BMI with an overlaid smooth density, one panel
# per level of `group`, with dotted lines at the BMI category boundaries.
# In the closed report the title also shows the Mann-Whitney-Wilcoxon p-value.

# Records carrying the height/weight codes 999 / 99.9 are excluded.
# NOTE(review): these are presumably placeholder codes for missing values - confirm.
# The same filtered data feed both the figure and the test, so the reported
# p-value describes exactly the participants that are plotted.
df_bmi_valid <- df_demographics_numerical %>%
  filter(height != 999, weight != 99.9)

ggplot(df_bmi_valid, aes(BMI)) + 
  geom_histogram(aes(y = ..density.., fill = factor(group)), alpha = 0.5) +
  geom_density() + 
  # guide = "none" replaces the deprecated guide = F (fill is redundant with the facets)
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = "none", option = 'cividis') + 
  geom_vline(xintercept = c(18.5, 25, 30, 40), linetype = 'dotted') + 
  scale_x_continuous(name = expression(paste("BMI (kg/", m^2, ")", sep = ''))) + 
  scale_y_continuous(name = 'Density') + 
  ggtitle(paste0(
    'Distribution of BMI in the trial population',
    if (CLOSED == TRUE) {
      paste0(" stratified per arm\np-value (Mann-Whitney-Wilcoxon) = ",
             round(wilcox.test(BMI ~ group, data = df_bmi_valid)$p.value, 3))
    } else {
      ""
    }
  )) + 
  theme_bw() + 
  # facet_grid(~group, labeller = labeller(group = c('Both'='Total trial population', '1' = Arm1, '2' = Arm2)))
  facet_grid(~group)

```

*Dotted lines delimit the categories underweight (<18.5), normal (18.5-24.9), overweight (25-29.9), obese (30-39.9), and extremely obese ($\geq$40).*

### Currently smoking

If participants self-reported as currently smoking, they were asked to report their number of pack-years of cigarette smoking. The distribution is shown below.

```{r demographics smoking, eval = eval_dsmb, echo = echo_dsmb, fig.height = 3, message = F}
# Density-scaled histogram of reported pack years with an overlaid smooth
# density, one panel per level of `group`. In the closed report the title
# also shows the Mann-Whitney-Wilcoxon p-value comparing the groups.
ggplot(df_risk, aes(pack_years)) + 
  geom_histogram(aes(y = ..density.., fill = factor(group)), alpha = 0.5) +
  geom_density() + 
  # guide = "none" replaces the deprecated guide = F (fill is redundant with the facets)
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = "none", option = 'cividis') + 
  scale_x_continuous(name = 'Pack years') + 
  scale_y_continuous(name = 'Density') + 
  ggtitle(paste0(
    'Distribution of currently smoking in the trial population',
    # the test is only run (and reported) for the closed, per-arm report
    if (CLOSED == TRUE) {
      paste0(" stratified per arm\np-value (Mann-Whitney-Wilcoxon) = ",
             round(wilcox.test(pack_years ~ group, data = df_risk)$p.value, 3))
    } else {
      ""
    }
  )) + 
  theme_bw() + 
  # facet_grid(~group, labeller = labeller(group = c('Both'='Total trial population', '1' = Arm1, '2' = Arm2)))
  facet_grid(~group)

```

*Additional tabulated demographic data can be found in [Appendix B][Appendix B: Additional summary tables], additional figures can be found in [Appendix C][Appendix C: Additional figures].*

## Risk factors 

The following risk factors/classifiers are identified (dataset variable given in brackets):

- age (see [above][Age])
- gender
- BMI (see [above][Body mass index (BMI)])
- ethnicity
- job category
- medical history of diabetes mellitus (medhis_dm)
- medical history of hypertension (medhis_hyptens)
- medical history of cardiovascular disease (medhis_cvd)
- medical history of asthma (medhis_asthma)
- medical history of COPD (medhis_copd)
- medical history of other lung disease (medhis_otherlung)
- medical history of kidney disease (medhis_kd)
- presence of a BCG scar (bcg_scar)
- currently smoking (pack_years, see [above][Currently smoking])

The full list of potential risk factors/classifiers can be found in [Appendix D][Appendix D: Additional data listings].

Distributions of the identified risk factors/classifiers (absolute and relative) are shown below`r ifelse(CLOSED == T, " stratified per arm", "")`. 

\newpage

```{r risk factors, eval = eval_dsmb, echo = echo_dsmb}
# Grouping expression: the closed report stratifies on `group`; the open
# report collapses both arms to 0 (trunc(1 - 1.5) and trunc(2 - 1.5) are both 0).
placeholder <- if (CLOSED == TRUE) {
  quo(group)
} else {
  quo(trunc(group - 1.5))
}

# Axis labels for the demographic categories and the medical-history variables
lab_categorical <- list(
  'Male' = 'Gender: Male',
  'Female' = 'Gender: Female',
  'African' = 'Ethnicity: African',
  'Caucasian' = 'Ethnicity: Caucasian',
  'Coloured' = 'Ethnicity: Coloured',
  'Indian' = 'Ethnicity: Indian',
  'Other' = 'Ethnicity: Other',
  'Doctor' = 'Profession: Doctor',
  'Nurse' = 'Profession: Nurse',
  'Essential_workers' = 'Profession: Essential\nworkers',
  'Support_staff' = 'Profession: Support',
  'Frontline_workers' = 'Profession: Frontline\nworkers'
)
lab_medical <- list(
  'bcg_scar' = 'BCG scar',
  'medhis_asthma' = 'Asthma',
  'medhis_copd' = 'COPD',
  'medhis_cvd' = 'Cardiovascular disease',
  'medhis_dm' = 'Diabetes mellitus',
  'medhis_hyptens' = 'Hypertension',
  'medhis_kd' = 'Kidney disease',
  'medhis_otherlung' = 'Other lung diseases'
)

# Adds the dodged bars, axis labels, title, fill palette and theme shared by
# all four risk-factor figures to a ggplot that already has data and aesthetics.
risk_bar_plot <- function(base_plot, x_labels, y_title, main_title) {
  base_plot +
    geom_bar(stat = 'identity', position = position_dodge()) +
    scale_x_discrete(labels = x_labels, name = '') +
    scale_y_continuous(name = y_title) +
    ggtitle(main_title) +
    scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', option = 'cividis') +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
}

# Converts counts to percentages of the matching population: all enrolled
# participants for "Both", the arm size for Arm1/Arm2, NA for any other label.
pct_of_population <- function(n, arm) {
  ifelse(arm == "Both", 100*n/part_enrol,
         ifelse(arm == Arm1, 100*n/part_group1,
                ifelse(arm == Arm2, 100*n/part_group2, NA)))
}

# Demographic risk factors/classifiers: absolute counts
risk_bar_plot(
  ggplot(df_risk_categorical_sum, aes(category, count, group = group, fill = factor(group))),
  lab_categorical,
  'Number of participants',
  'Demographic risk factors/classifiers'
)

# Demographic risk factors/classifiers: percentages
df_risk_categorical_sum %>%
  mutate(count_prop = pct_of_population(count, group)) %>%
  ggplot(aes(category, count_prop, group = group, fill = factor(group))) %>%
  risk_bar_plot(
    lab_categorical,
    'Proportion of participants (%)',
    'Demographic risk factors/classifiers'
  )

# Medical risk factors: absolute counts (the medhis_cvd_type rows are left out)
df_med_risk_sum %>%
  filter(risk != 'medhis_cvd_type') %>%
  ggplot(aes(risk, prevalence, group = group, fill = factor(group))) %>%
  risk_bar_plot(
    lab_medical,
    'Number of participants\nwith risk factor',
    'Medical risk factors'
  )

# Medical risk factors: percentages
df_med_risk_sum %>%
  filter(risk != 'medhis_cvd_type') %>%
  mutate(prev_prop = pct_of_population(prevalence, group)) %>%
  ggplot(aes(risk, prev_prop, group = group, fill = factor(group))) %>%
  risk_bar_plot(
    lab_medical,
    'Proportion of participants\nwith risk factor (%)',
    'Medical risk factors'
  )

```

## Laboratory findings

### Serology for SARS-CoV-2

At week 0, 10, 26, and 52, participants were tested for SARS-CoV-2 infection based on immunoglobulin G (IgG) serology.

`r ifelse(CLOSED == T, paste(n_c19_serology_group1, " (", round(100 * n_c19_serology_group1 / part_c19_serology_group1, 1), "%) participants had at least one positive serology result for SARS-CoV-2 based on IgG in the ", Arm1, " arm.  \n", n_c19_serology_group2, " (", round(100 * n_c19_serology_group2 / part_c19_serology_group2, 1), "%) participants had at least one positive serology result for SARS-CoV-2 based on IgG in the ", Arm2, " arm.", sep = ''), paste(n_c19_serology, " (", round(100 * n_c19_serology / part_c19_serology, 1), "%) participants had at least one positive serology result for SARS-CoV-2 based on IgG.", sep = ''))` 

Full details on the serology findings can be found in [SARS-CoV-2 antibodies][SARS-CoV-2 antibodies].

### Tuberculosis 

At week 0 and 52, participants were tested for TB infection based on an interferon-gamma release assay (IGRA).

#### Week 0  

`r ifelse(CLOSED == T, paste(n_TB_baseline_pos_group1, " (", round(100 * n_TB_baseline_pos_group1 / part_TB_week0_group1, 1), "%) participants had a positive IGRA result for TB at baseline in the ", Arm1, " arm.  \n", n_TB_baseline_pos_group2, " (", round(100 * n_TB_baseline_pos_group2 / part_TB_week0_group2, 1), "%) participants had a positive IGRA result for TB at baseline in the ", Arm2, " arm.", sep = ''), paste(n_TB_baseline_pos, " (", round(100 * n_TB_baseline_pos / part_TB_week0, 1), "%) participants had a positive IGRA result for TB at baseline.", sep = ''))` 

`r ifelse(CLOSED == T, paste(n_TB_baseline_int_group1, " (", round(100 * n_TB_baseline_int_group1 / part_TB_week0_group1, 1), "%) participants had an intermediate IGRA result for TB at baseline in the ", Arm1, " arm.  \n", n_TB_baseline_int_group2, " (", round(100 * n_TB_baseline_int_group2 / part_TB_week0_group2, 1), "%) participants had an intermediate IGRA result for TB at baseline in the ", Arm2, " arm.", sep = ''), paste(n_TB_baseline_int, " (", round(100 * n_TB_baseline_int / part_TB_week0, 1), "%) participants had an intermediate IGRA result for TB at baseline.", sep = ''))` 

`r ifelse(CLOSED == T, paste(n_TB_baseline_neg_group1, " (", round(100 * n_TB_baseline_neg_group1 / part_TB_week0_group1, 1), "%) participants had a negative IGRA result for TB at baseline in the ", Arm1, " arm.  \n", n_TB_baseline_neg_group2, " (", round(100 * n_TB_baseline_neg_group2 / part_TB_week0_group2, 1), "%) participants had a negative IGRA result for TB at baseline in the ", Arm2, " arm.", sep = ''), paste(n_TB_baseline_neg, " (", round(100 * n_TB_baseline_neg / part_TB_week0, 1), "%) participants had a negative IGRA result for TB at baseline.", sep = ''))`

`r ifelse(CLOSED == T, paste(n_TB_baseline_nr_group1, " (", round(100 * n_TB_baseline_nr_group1 / part_TB_week0_group1, 1), "%) participants had a 'no result' IGRA result for TB at baseline in the ", Arm1, " arm.  \n", n_TB_baseline_nr_group2, " (", round(100 * n_TB_baseline_nr_group2 / part_TB_week0_group2, 1), "%) participants had a 'no result' IGRA result for TB at baseline in the ", Arm2, " arm.", sep = ''), paste(n_TB_baseline_nr, " (", round(100 * n_TB_baseline_nr / part_TB_week0, 1), "%) participants had a 'no result' IGRA result for TB at baseline.", sep = ''))`

#### Week 52  

`r ifelse(CLOSED == T, paste(n_TB_week52_pos_group1, " (", round(100 * n_TB_week52_pos_group1 / part_TB_week52_group1, 1), "%) participants had a positive IGRA result for TB at week 52 in the ", Arm1, " arm.  \n", n_TB_week52_pos_group2, " (", round(100 * n_TB_week52_pos_group2 / part_TB_week52_group2, 1), "%) participants had a positive IGRA result for TB at week 52 in the ", Arm2, " arm.", sep = ''), paste(n_TB_week52_pos, " (", round(100 * n_TB_week52_pos / part_TB_week52, 1), "%) participants had a positive IGRA result for TB at week 52.", sep = ''))` 

`r ifelse(CLOSED == T, paste(n_TB_week52_int_group1, " (", round(100 * n_TB_week52_int_group1 / part_TB_week52_group1, 1), "%) participants had an intermediate IGRA result for TB at week 52 in the ", Arm1, " arm.  \n", n_TB_week52_int_group2, " (", round(100 * n_TB_week52_int_group2 / part_TB_week52_group2, 1), "%) participants had an intermediate IGRA result for TB at week 52 in the ", Arm2, " arm.", sep = ''), paste(n_TB_week52_int, " (", round(100 * n_TB_week52_int / part_TB_week52, 1), "%) participants had an intermediate IGRA result for TB at week 52.", sep = ''))` 

`r ifelse(CLOSED == T, paste(n_TB_week52_neg_group1, " (", round(100 * n_TB_week52_neg_group1 / part_TB_week52_group1, 1), "%) participants had a negative IGRA result for TB at week 52 in the ", Arm1, " arm.  \n", n_TB_week52_neg_group2, " (", round(100 * n_TB_week52_neg_group2 / part_TB_week52_group2, 1), "%) participants had a negative IGRA result for TB at week 52 in the ", Arm2, " arm.", sep = ''), paste(n_TB_week52_neg, " (", round(100 * n_TB_week52_neg / part_TB_week52, 1), "%) participants had a negative IGRA result for TB at week 52.", sep = ''))`

`r ifelse(CLOSED == T, paste(n_TB_week52_nr_group1, " (", round(100 * n_TB_week52_nr_group1 / part_TB_week52_group1, 1), "%) participants had a 'no result' IGRA result for TB at week 52 in the ", Arm1, " arm.  \n", n_TB_week52_nr_group2, " (", round(100 * n_TB_week52_nr_group2 / part_TB_week52_group2, 1), "%) participants had a 'no result' IGRA result for TB at week 52 in the ", Arm2, " arm.", sep = ''), paste(n_TB_week52_nr, " (", round(100 * n_TB_week52_nr / part_TB_week52, 1), "%) participants had a 'no result' IGRA result for TB at week 52.", sep = ''))`

#### Week unknown

`r ifelse(CLOSED == T, paste(n_TB_unknown_pos_group1, " (", round(100 * n_TB_unknown_pos_group1 / part_TB_unknown_group1, 1), "%) participants had a positive IGRA result for TB at an unknown week in the ", Arm1, " arm.  \n", n_TB_unknown_pos_group2, " (", round(100 * n_TB_unknown_pos_group2 / part_TB_unknown_group2, 1), "%) participants had a positive IGRA result for TB at an unknown week in the ", Arm2, " arm.", sep = ''), paste(n_TB_unknown_pos, " (", round(100 * n_TB_unknown_pos / part_TB_unknown, 1), "%) participants had a positive IGRA result for TB at an unknown week.", sep = ''))`

`r ifelse(CLOSED == T, paste(n_TB_unknown_int_group1, " (", round(100 * n_TB_unknown_int_group1 / part_TB_unknown_group1, 1), "%) participants had an intermediate IGRA result for TB at an unknown week in the ", Arm1, " arm.  \n", n_TB_unknown_int_group2, " (", round(100 * n_TB_unknown_int_group2 / part_TB_unknown_group2, 1), "%) participants had an intermediate IGRA result for TB at an unknown week in the ", Arm2, " arm.", sep = ''), paste(n_TB_unknown_int, " (", round(100 * n_TB_unknown_int / part_TB_unknown, 1), "%) participants had an intermediate IGRA result for TB at an unknown week.", sep = ''))`

`r ifelse(CLOSED == T, paste(n_TB_unknown_neg_group1, " (", round(100 * n_TB_unknown_neg_group1 / part_TB_unknown_group1, 1), "%) participants had a negative IGRA result for TB at an unknown week in the ", Arm1, " arm.  \n", n_TB_unknown_neg_group2, " (", round(100 * n_TB_unknown_neg_group2 / part_TB_unknown_group2, 1), "%) participants had a negative IGRA result for TB at an unknown week in the ", Arm2, " arm.", sep = ''), paste(n_TB_unknown_neg, " (", round(100 * n_TB_unknown_neg / part_TB_unknown, 1), "%) participants had a negative IGRA result for TB at an unknown week.", sep = ''))`

`r ifelse(CLOSED == T, paste(n_TB_unknown_nr_group1, " (", round(100 * n_TB_unknown_nr_group1 / part_TB_unknown_group1, 1), "%) participants had a 'no result' IGRA result for TB at an unknown week in the ", Arm1, " arm.  \n", n_TB_unknown_nr_group2, " (", round(100 * n_TB_unknown_nr_group2 / part_TB_unknown_group2, 1), "%) participants had a 'no result' IGRA result for TB at an unknown week in the ", Arm2, " arm.", sep = ''), paste(n_TB_unknown_nr, " (", round(100 * n_TB_unknown_nr / part_TB_unknown, 1), "%) participants had a 'no result' IGRA result for TB at an unknown week.", sep = ''))`

#### Active tuberculosis

`r ifelse(CLOSED == T, paste(n_active_TB_group1, " (", round(100 * n_active_TB_group1  / part_group1, 1), "%) participants had a case of active TB during the trial in the ", Arm1, " arm.  \n", n_active_TB_group2, " (", round(100 * n_active_TB_group2 / part_group2, 1), "%) participants had a case of active TB during the trial in the ", Arm2, " arm.", sep = ''), paste(n_active_TB, " (", round(100 * n_active_TB / part_enrol, 1), "%) participants had a case of active TB during the trial.", sep = ''))` 

\newpage

## SARS-CoV-2 specific vaccination 

Vaccination of trial participants with SARS-CoV-2 specific vaccines is reported below. 

`r ifelse(CLOSED == T, paste(part_vaccine_C19_group1, " (", round(100 * part_vaccine_C19_group1 / part_group1, 1), "%) participants recorded a SARS-CoV-2 specific vaccination since the start of the trial in the ", Arm1, " arm.  \n", part_vaccine_C19_group2, " (", round(100 * part_vaccine_C19_group2 / part_group2, 1), "%) participants recorded a SARS-CoV-2 specific vaccination since the start of the trial in the ", Arm2, " arm.", sep = ''), paste(part_vaccine_C19, " (", round(100 * part_vaccine_C19 / part_enrol, 1), "%) participants recorded a SARS-CoV-2 specific vaccination since the start of the trial.", sep = ''))`

Kaplan-Meier estimates of the proportion of subjects who were not vaccinated with a SARS-CoV-2 specific vaccine versus time. `r ifelse(CLOSED == T, "The figure is stratified by study arm. ", "")`The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. Vertical lines show censoring events (withdrawal or loss to follow-up, or death, whichever comes first). The table below the figure shows the number of subjects still eligible for SARS-CoV-2 specific vaccination`r ifelse(CLOSED == T, ", stratified by study arm", "")`, versus time. 

```{r KM plot C19-vaccination, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}
#censoring KM

df_PID_nocensor <- df_full %>% 
  select(PID, date_vaccination, final_date) %>% 
  arrange(PID, final_date) %>% #in case multiple final dates, pick the first one
  distinct(PID, .keep_all = T) %>% 
  mutate(time = ifelse(is.na(final_date), as.numeric(as.Date(date_cut_off, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days'), as.numeric(as.Date(final_date, format = '%Y-%m-%d') - as.Date(date_vaccination, format = '%Y-%m-%d'), unit = 'days'))) %>% 
  mutate(censor_date = NA,
         Covid_VAC_name = NA,
         censor = NA, 
         status = 0) %>% 
  select(names(df_PID_censor)) %>% 
  filter(!(PID %in% (df_PID_censor$PID))) #only keep the records of those that are not censored
  
df_PID_KM_vaccination <- rbind(
  
  #the PIDs that are censored
  df_PID_censor %>% 
           mutate(status = ifelse(censor == 1, 1, status)) %>%  #keep censoring at 0 for withdrawal and death, but 1 for vaccination
           arrange(PID, time) %>% #order PID and time so the first censoring will be kept
           distinct(PID, .keep_all = T),
    df_PID_nocensor             
  ) %>% 
  left_join(PID_group)

#model fit 
fit <- survfit(Surv(time/7, status) ~ group, data = df_PID_KM_vaccination)

#colour matching other figures
col_vir <- viridis(n = 2, end = 0.75, option = 'cividis')


ggsurvplot(fit, data = df_PID_KM_vaccination,
           palette = col_vir,
           ggtheme = theme_bw(),
           xlab = 'Time since BCG vaccination (weeks)',
           ylab = 'Total proportion not vaccinated\nwith a specific SARS-CoV-2 vaccine',
           break.time.by = 10,
           risk.table = T,
           legend.title = '',
           title = 'Kaplan-Meier plot for SARS-CoV-2 vaccination',
           conf.int = TRUE,
           censor = T)

```


\newpage

# Primary endpoint

The primary endpoint was the proportion of participants hospitalized due to COVID-19 per arm.

## Hospitalization due to COVID-19

`r ifelse(CLOSED == T, paste(n_hosp_c19_group1, " (", round(100 * n_hosp_c19_group1 / part_group1, 1), "%) participants were hospitalized in the ", Arm1, " arm with a positive COVID-19 PCR test result.  \n", n_hosp_c19_group2, " (", round(100 * n_hosp_c19_group2 / part_group2, 1), "%) participants were hospitalized in the ", Arm2, " arm with a positive COVID-19 PCR test result.", sep = ''), paste(n_hosp_c19, " (", round(100 * n_hosp_c19 / part_enrol, 1), "%) participants were hospitalized with a positive COVID-19 PCR test result.", sep = ''))` 

The endpoint is here reported based on the intention-to-treat population with right censoring only at withdrawal of consent or lost to follow-up, death, or the end of the trial.

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized with a positive COVID-19 PCR test result versus time. `r ifelse(CLOSED == T, "The figure is stratified by study arm. ", "")`The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or lost to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized`r ifelse(CLOSED == T, ", stratified by study arm", "")`, versus time. A figure with an enhanced y-axis is shown as well for clearer visualization. 


```{r KM plot hospitalization C19, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

#model fit 
fit <- survfit(Surv(time/7, status) ~ group, data = hosp_c19_km_censor_SAP)

#colour matching other figures
col_vir <- viridis(n = 2, end = 0.75, option = 'cividis')

ggsurvplot(fit, data = hosp_c19_km_censor_SAP,
           palette = col_vir,
           ggtheme = theme_bw(),
           xlab = 'Time since vaccination (weeks)',
           ylab = 'Total proportion not hospitalized\nwith a positive COVID-19 test result',
           break.time.by = 10,
           risk.table = T,
           legend.title = '',
           title = 'Kaplan-Meier plot for first hospitalization\nwith a positive COVID-19 test result',
           conf.int = TRUE,
           censor = T)

```

```{r KM plot hospitalization C19-2, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

#model fit 
fit <- survfit(Surv(time/7, status) ~ group, data = hosp_c19_km_censor_SAP)

#colour matching other figures
col_vir <- viridis(n = 2, end = 0.75, option = 'cividis')

ggsurvplot(fit, data = hosp_c19_km_censor_SAP,
           palette = col_vir,
           ggtheme = theme_bw(),
           xlab = 'Time since vaccination (weeks)',
           ylab = 'Total proportion not hospitalized\nwith a positive COVID-19 test result',
           ylim = c(0.95, 1),
           break.time.by = 10,
           risk.table = T,
           legend.title = '',
           title = 'Kaplan-Meier plot (enhanced y-axis) for first hospitalization\nwith a positive COVID-19 test result',
           conf.int = TRUE,
           censor = T)


```

### Primary endpoint per site

The primary endpoint of hospitalization due to COVID-19 is reported below per site.

```{r table hospitalization C19 site, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

if(CLOSED == T){

hosp_c19 %>% 
  separate(PID, into = c('temp', 'ID'), sep = 'BCG', remove = F) %>% 
  group_by(site, group) %>% 
  summarize(n = n()) %>% 
  ungroup() %>% 
  pivot_wider(names_from = site, values_from = n) %>% 
  kable()
  
} else {

hosp_c19 %>% 
  separate(PID, into = c('temp', 'ID'), sep = 'BCG', remove = F) %>% 
  group_by(site, group) %>% 
  summarize(n = n()) %>% 
  ungroup() %>% 
  pivot_wider(names_from = site, values_from = n) %>% 
  kable()   
  
}

```

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized with a positive COVID-19 PCR test result versus time, shown per site. `r ifelse(CLOSED == T, "The figure is stratified by study arm. ", "")`The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or lost to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized`r ifelse(CLOSED == T, ", stratified by study arm", "")`, versus time. The figure is shown with an enhanced y-axis for clearer visualization. 

```{r KM plot hospitalization C19 per site, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}
#model fit 
fit2 <- survfit(Surv(time/7, status) ~ group + site, data = hosp_c19_km_censor_SAP)
col_vir <- viridis(n = 6, end = 0.75, option = 'cividis')

km_site <- ggsurvplot(fit2, data = hosp_c19_km_censor_SAP,
           palette = col_vir,
           ggtheme = theme_bw(),
           xlab = 'Time since vaccination (weeks)',
           ylab = 'Total proportion not hospitalized\nwith a positive COVID-19 test result',
           ylim = c(0.8, 1),
           break.time.by = 10,
           risk.table = T,
           legend.title = '',
           title = 'Kaplan-Meier plot (enhanced y-axis) for first hospitalization\nwith a positive COVID-19 test result',
           conf.int = TRUE,
           censor = T)

km_site$plot + 
  facet_grid(~site) + theme(axis.text = element_text(size = 7), legend.text = element_text(size = 7))
```

```{r KM plot hospitalization C19 per site table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
km_site$table
```

*Kaplan-Meier plots stratified per risk factor can be found in [Appendix E][Appendix E: Hospitalization by risk factor/classifier].*

```{r Cox PH hospitalization treatment, eval = CLOSED, echo = echo_dsmb, message = F, fig.height=5}

#ITT

hosp_c19_km_itt <- hosp_c19_km_itt %>% 
  mutate(treatment = factor(group))

m0_group_itt <- coxph(formula = Surv(time, status) ~ treatment, data = hosp_c19_km_itt) 

##PP

hosp_c19_km_pp <- hosp_c19_km_pp %>% 
  mutate(treatment = factor(group))

m0_group_pp <- coxph(formula = Surv(time, status) ~ treatment, data = hosp_c19_km_pp) 


```

### Primary endpoint analysis

A Cox proportional hazard model was utilized to assess the statistical significance of the treatment arm on the primary endpoint. The dataset contains the intention-to-treat population and included censoring because of withdrawal from the trial or lost to follow-up, or death, as shown above in the Kaplan-Meier curves.

The hazard ratio was `r round(summary(m0_group_itt)$coefficients[2], 2)` (`r round(exp(confint(m0_group_itt))[1], 2)`-`r #round(exp(confint(m0_group_itt))[2], 1)` 95\% confidence interval) for the `r Arm2` arm relative to the `r Arm1` arm (p-value = `r round(summary(m0_group_itt)$coefficients[5], 3)`). 

The results were not sensitive to different initial estimates corresponding to a hazard ratio range from 0.1 to 10 (100-fold). The forest plot below visualizes the hazard ratio including its 95% confidence interval.

```{r Forest-plot-trt, eval = CLOSED, echo = echo_dsmb, message = F, fig.height=6.5}
ggforest(m0_group_itt,
         data = hosp_c19_km_itt,
         main = 'ITT forest plot showing treatment hazard ratio for the primary endpoint analysis\n(Cox propotional hazard model for first hospitalization due to COVID-19)',
         noDigits = 2)

ggforest(m0_group_pp,
         data = hosp_c19_km_pp,
         main = 'PP forest plot showing treatment hazard ratio for the primary endpoint analysis\n(Cox propotional hazard model for first hospitalization due to COVID-19)',
         noDigits = 2)

```
*AIC = Akaike information criterion*

\newpage

# Secondary endpoints

The table below shows the secondary endpoints and their corresponding section of the report.

| Secondary objective | Description | Report section 
| --- | ------------ | ------ | 
| a | To determine the incidence of SARS-CoV-2 infection in HCW by molecular or serological testing (as available) at entry, 10, 26 and/or 52 weeks. The timing of serological testing will be adjusted to the availability of validated tests and the course the epidemic of COVID-19 will take. | [SARS-CoV-2 antibodies][SARS-CoV-2 antibodies] (page \pageref{sars-cov-2-antibodies})
| b | To compare the incidence of symptoms of respiratory tract infection per arm. | [Respiratory tract infections events][Respiratory tract infections events] (page \pageref{respiratory-tract-infections-events})
| c | To compare the number of days of (unplanned) absenteeism because of documented SARS-CoV-2 infection or COVID-19 per arm. | [Recorded sick leave during the trial][Recorded sick leave during the trial] (page \pageref{recorded-sick-leave-during-the-trial})
| d | To compare the number of days of (unplanned) absenteeism for any reason per arm. | [Recorded sick leave during the trial][Recorded sick leave during the trial] (page \pageref{recorded-sick-leave-during-the-trial})
| e | To compare the incidence of hospitalization of HCW for any reason per arm. | [Total hospitalization][Total hospitalization] (page \pageref{total-hospitalization})
| f | To compare the incidence of intensive care admission of HCW due to COVID-19 per arm. | [Hospitalized with ventilation (HS = 6)][Hospitalized with ventilation (HS = 6)] (page \pageref{hospitalized-with-ventilation})
| g | To compare the incidence of intensive care admission of HCW for any reason per arm. |[Intentive care admissions][Intentive care admissions] (page \pageref{intentive-care-admissions})
| h | To compare the incidence of death of HCW due to COVID-19 per arm. | [Death (HS = 7)][Death (HS = 7)] (page \pageref{death})
| i | To compare the incidence of death of HCW for any reason per arm. | [Deaths][Deaths] (page \pageref{deaths})
| j | To describe the prevalence of latent TB infection as determined by interferon gamma release assay (IGRA) at enrolment and at week 52. | [Tuberculosis][Tuberculosis] (page \pageref{tuberculosis})
| k | To compare the incidence of active TB of HCW per arm. | [Active tuberculosis][Active tuberculosis] (page \pageref{active-tuberculosis})
| l | To compare the effect of latent TB infection on morbidity and mortality of HCW due to COVID-19 per arm. | [Latent tuberculosis infection and COVID-19 or respiratory tract infections][Latent tuberculosis infection and COVID-19 or respiratory tract infections] (page \pageref{latent-tuberculosis-infection-and-covid-19-or-respiratory-tract-infections})
| m | To compare the incidence of grade 2 or higher adverse events and vaccination site reactions per arm. | [Grade 2 or higher][Grade 2 or higher] (page \pageref{grade-2-or-higher})

\newpage 

# COVID-19 cases

*Please be aware not all participants are tested as per guidelines. Participants can come in with proof of a positive test from a third party (PCR or other). Serology testing at week 0, 10, 26, and 52 showed IgG-based test results of SARS-CoV-2. Health status (HS) score reflects the highest HS recorded per event.*

## Total cases of COVID-19

`r ifelse(CLOSED == T, paste("In total, ", n_c19_event_group1, " COVID-19 cases diagnosed with a positive PCR test have been reported in ", part_c19_group1, " (", round(100 * part_c19_group1 / part_group1, 1), "%) participants in the ", Arm1, " arm.  \nIn total, ", n_c19_event_group2, " COVID-19 cases diagnosed with a positive PCR test have been reported in ", part_c19_group2, " (", round(100 * part_c19_group2 / part_group2, 1), "%) participants in the ", Arm2, " arm.", sep = ''), paste("In total, ", n_c19_event, " COVID-19 cases diagnosed with a positive PCR test have been reported in ", part_c19, " (", round(100 * part_c19 / part_enrol, 1), "%) participants.", sep = ''))`

The endpoint is here reported based on the intention-to-treat population with right censoring only at withdrawal of consent or lost to follow-up, death, or the end of the trial.

Kaplan-Meier estimates of the proportion of subjects without a positive COVID-19 PCR test result versus time. `r ifelse(CLOSED == T, "The figure is stratified by study arm. ", "")`The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or lost to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of a first COVID-19 event`r ifelse(CLOSED == T, ", stratified by study arm", "")`, versus time. A figure with an enhanced y-axis is shown as well for clearer visualization. 

```{r KM plot COVID-19, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}
#model fit 
fit <- survfit(Surv(time/7, status) ~ group, data = c19_km_censor_novac)

#colour matching other figures
col_vir <- viridis(n = 2, end = 0.75, option = 'cividis')


ggsurvplot(fit, data = c19_km_censor_novac, 
           palette = col_vir,
           ggtheme = theme_bw(), 
           xlab = 'Time since vaccination (weeks)', 
           ylab = 'Total proportion without COVID-19',
           break.time.by = 10,
           risk.table = T,
           legend.title = '',
           title = 'Kaplan-Meier plot for first COVID-19 event',
           conf.int = TRUE, 
           censor = T)

```
  

```{r KM plot COVID-19-2, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}
#model fit 
fit <- survfit(Surv(time/7, status) ~ group, data = c19_km_censor_novac)

#colour matching other figures
col_vir <- viridis(n = 2, end = 0.75, option = 'cividis')


ggsurvplot(fit, data = c19_km_censor_novac, 
           palette = col_vir,
           ggtheme = theme_bw(), 
           xlab = 'Time since vaccination (weeks)', 
           ylab = 'Total proportion without COVID-19',
           ylim = c(0.75, 1),
           break.time.by = 10,
           risk.table = T,
           legend.title = '',
           title = 'Kaplan-Meier plot (enhanced y-axis) for first COVID-19 event',
           conf.int = TRUE, 
           censor = T)

```
  
## Mild (HS = 1)

`r ifelse(CLOSED == T, paste("In total, ", n_c19_HS1_group1, " (", round(100 * n_c19_HS1_group1 / part_group1, 1), "%) participants had a positive PCR test result for COVID-19 with mild symptoms in the ", Arm1, " arm.  \nIn total, ", n_c19_HS1_group2, " (", round(100 * n_c19_HS1_group2 / part_group2, 1), "%) participants had a positive PCR test result for COVID-19 with mild symptoms in the ", Arm2, " arm.", sep = ''), paste("In total, ", n_c19_HS1, " (", round(100 * n_c19_HS1 / part_enrol, 1), "%) participants had a positive PCR test result for COVID-19 with mild symptoms.", sep = ''))`

## Moderate (HS = 2)

`r ifelse(CLOSED == T, paste("In total, ", n_c19_HS2_group1, " (", round(100 * n_c19_HS2_group1 / part_group1, 1), "%) participants had a positive PCR test result for COVID-19 with moderate symptoms in the ", Arm1, " arm.  \nIn total, ", n_c19_HS2_group2, " (", round(100 * n_c19_HS2_group2 / part_group2, 1), "%) participants had a positive PCR test result for COVID-19 with moderate symptoms in the ", Arm2, " arm.", sep = ''), paste("In total, ", n_c19_HS2, " (", round(100 * n_c19_HS2 / part_enrol, 1), "%) participants had a positive PCR test result for COVID-19 with moderate symptoms.", sep = ''))`

## Severe (HS = 3)

`r ifelse(CLOSED == T, paste("In total, ", n_c19_HS3_group1, " (", round(100 * n_c19_HS3_group1 / part_group1, 1), "%) participants had a positive PCR test result for COVID-19 with severe symptoms in the ", Arm1, " arm.  \nIn total, ", n_c19_HS3_group2, " (", round(100 * n_c19_HS3_group2 / part_group2, 1), "%) participants had a positive PCR test result for COVID-19 with severe symptoms in the ", Arm2, " arm.", sep = ''), paste("In total, ", n_c19_HS3, " (", round(100 * n_c19_HS3 / part_enrol, 1), "%) participants had a positive PCR test result for COVID-19 with severe symptoms.", sep = ''))`

## Hospitalized (HS = 4)

`r ifelse(CLOSED == T, paste("In total, ", n_c19_HS4_group1, " (", round(100 * n_c19_HS4_group1 / part_group1, 1), "%) hospitalized participants had a positive PCR test result for COVID-19 in the ", Arm1, " arm.  \nIn total, ", n_c19_HS4_group2, " (", round(100 * n_c19_HS4_group2 / part_group2, 1), "%) hospitalized participants had a positive PCR test result for COVID-19 in the ", Arm2, " arm.", sep = ''), paste("In total, ", n_c19_HS4, " (", round(100 * n_c19_HS4 / part_enrol, 1), "%) hospitalized participants had a positive PCR test result for COVID-19.", sep = ''))`

## Hospitalized with oxygen (HS = 5)

`r ifelse(CLOSED == T, paste("In total, ", n_c19_HS5_group1, " (", round(100 * n_c19_HS5_group1 / part_group1, 1), "%) hospitalized participants with oxygen had a positive PCR test result for COVID-19 in the ", Arm1, " arm.  \nIn total, ", n_c19_HS5_group2, " (", round(100 * n_c19_HS5_group2 / part_group2, 1), "%) hospitalized participants with oxygen had a positive PCR test result for COVID-19 in the ", Arm2, " arm.", sep = ''), paste("In total, ", n_c19_HS5, " (", round(100 * n_c19_HS5 / part_enrol, 1), "%) hospitalized participants with oxygen had a positive PCR test result for COVID-19.", sep = ''))`

## Hospitalized with ventilation (HS = 6) {#hospitalized-with-ventilation}

`r ifelse(CLOSED == T, paste("In total, ", n_c19_HS6_group1, " (", round(100 * n_c19_HS6_group1 / part_group1, 1), "%) hospitalized participants with ventilation had a positive PCR test result for COVID-19 in the ", Arm1, " arm.  \nIn total, ", n_c19_HS6_group2, " (", round(100 * n_c19_HS6_group2 / part_group2, 1), "%) hospitalized participants with ventilation had a positive PCR test result for COVID-19 in the ", Arm2, " arm.", sep = ''), paste("In total, ", n_c19_HS6, " (", round(100 * n_c19_HS6 / part_enrol, 1), "%) hospitalized participants with ventilation had a positive PCR test result for COVID-19.", sep = ''))`

## Death (HS = 7) {#death}

`r ifelse(CLOSED == T, paste("In total, ", n_c19_HS7_group1, " (", round(100 * n_c19_HS7_group1 / part_group1, 1), "%) participant who died had a positive PCR test result for COVID-19 in the ", Arm1, " arm.  \nIn total, ", n_c19_HS7_group2, " (", round(100 * n_c19_HS7_group2 / part_group2, 1), "%) participant who died had a positive PCR test result for COVID-19 in the ", Arm2, " arm.", sep = ''), paste("In total, ", n_c19_HS7, " (", round(100 * n_c19_HS7 / part_enrol, 1), "%) participant who died had a positive PCR test result for COVID-19.", sep = ''))`

## Repeated infection

`r ifelse(CLOSED == T, paste(part_repeated_infect_group1, " (", round(100 * part_repeated_infect_group1 / part_group1, 1), "%) participant had a repeated COVID-19 infection in the ", Arm1, " arm.  \n", part_repeated_infect_group2, " (", round(100 * part_repeated_infect_group2 / part_group2, 1), "%) participant had a repeated COVID-19 infection in the ", Arm2, " arm.", sep = ''), paste(part_repeated_infect, " (", round(100 * part_repeated_infect / part_enrol, 1), "%) participant had a repeated COVID-19 infection.", sep = ''))`

## Post COVID-19 viral syndrome

`r ifelse(CLOSED == T, paste(nrow(part_post_group1), " (", round(100 * nrow(part_post_group1) / part_c19_group1, 1), "%) participants experienced post COVID-19 viral syndrome in the ", Arm1, " arm.  \n", nrow(part_post_group2), " (", round(100 * nrow(part_post_group2) / part_c19_group2, 1), "%) participants experienced post COVID-19 viral syndrome in the ", Arm2, " arm.", sep = ''), paste(nrow(part_post), " (", round(100 * nrow(part_post) / part_c19, 1), "%) participants experienced post COVID-19 viral syndrome.", sep = ''))`

\newpage

## SARS-CoV-2 antibodies

#### Overall

`r ifelse(CLOSED == T, paste(n_c19_serology_group1, " (", round(100 * n_c19_serology_group1 / part_c19_serology_group1, 1), "%, n=", part_c19_serology_group1, ") participants had at least one positive serology result for SARS-CoV-2 based on IgG in the ", Arm1, " arm.  \n", n_c19_serology_group2, " (", round(100 * n_c19_serology_group2 / part_c19_serology_group2, 1), "%, n=", part_c19_serology_group2, ") participants had at least one positive serology result for SARS-CoV-2 based on IgG in the ", Arm2, " arm.", sep = ''), paste(n_c19_serology, " (", round(100 * n_c19_serology / part_c19_serology, 1), "%, n=", part_c19_serology, ") participants had at least one positive serology result for SARS-CoV-2 based on IgG.", sep = ''))`  

#### Without COVID-19 symptoms

`r ifelse(CLOSED == T, paste(n_c19_serology_nosym_group1, " (", round(100 * n_c19_serology_nosym_group1 / part_c19_serology_group1, 1), "%, n=", part_c19_serology_group1, ") participants had a positive serology result (at baseline or week 10, 26, or 52, or at an unknown week) for SARS-CoV-2 based on IgG but showed no symptoms (no COVID-19 events reported) in the ", Arm1, " arm, assuming that those positive at baseline did not report any symptoms before they were included.  \n", n_c19_serology_nosym_group2, " (", round(100 * n_c19_serology_nosym_group2 / part_c19_serology_group2, 1), "%, n=", part_c19_serology_group2, ") participants had a positive serology result (at baseline or week 10, 26, or 52, or at an unknown week) for SARS-CoV-2 based on IgG but showed no symptoms (no COVID-19 events reported) in the ", Arm2, " arm, assuming that those positive at baseline did not report any symptoms before they were included.", sep = ''), paste(n_c19_serology_nosym, " (", round(100 * n_c19_serology_nosym / part_c19_serology, 1), "%, n=", part_c19_serology, ") participants had a positive serology result (at baseline or week 10, 26, or 52, or at an unknown week) for SARS-CoV-2 based on IgG but showed no symptoms (no COVID-19 events reported), assuming that those positive at baseline did not report any symptoms before they were included.", sep = ''))` 

#### Baseline

`r ifelse(CLOSED == T, paste(n_c19_serology_baseline_group1, " (", round(100 * n_c19_serology_baseline_group1 / part_c19_serology_week0_group1, 1), "%, n=", part_c19_serology_week0_group1, ") participants had a positive serology result for SARS-CoV-2 based on IgG at baseline in the ", Arm1, " arm.  \n", n_c19_serology_baseline_group2, " (", round(100 * n_c19_serology_baseline_group2 / part_c19_serology_week0_group2, 1), "%, n=", part_c19_serology_week0_group2, ") participants had a positive serology result for SARS-CoV-2 based on IgG at baseline in the ", Arm2, " arm.", sep = ''), paste(n_c19_serology_baseline, " (", round(100 * n_c19_serology_baseline / part_c19_serology_week0, 1), "%, n=", part_c19_serology_week0, ") participants had a positive serology result for SARS-CoV-2 based on IgG at baseline.", sep = ''))`  
`r ifelse(CLOSED == T, paste("A Pearson's chi-squared test for the baseline serology between the two groups showed a p-value of ", round(prop.test(x = c(n_c19_serology_baseline_group1, n_c19_serology_baseline_group2), n = c(part_c19_serology_week0_group1, part_c19_serology_week0_group2))$p.value, 3), '.', sep = ''), '')`  
`r ifelse(CLOSED == T, paste('Of these baseline seropositive participants, ', n_rti_serobaseline_event_group1, " (", round(100 * n_rti_serobaseline_event_group1 / n_c19_serology_baseline_group1, 1), "%) participants had an RTI event after enrolment in the ", Arm1, " arm, and ", n_rti_serobaseline_event_group2, " (", round(100 * n_rti_serobaseline_event_group2 / n_c19_serology_baseline_group2, 1), "%) participants had an RTI event after enrolment in the ", Arm2, " arm.", sep = ''), paste('Of these baseline seropositive participants, ', n_rti_serobaseline_event, " (", round(100 * n_rti_serobaseline_event / n_c19_serology_baseline, 1), "%) participants had an RTI event after enrolment.", sep = ''))`  
`r ifelse(CLOSED == T, paste('Of these baseline seropositive participants, ', n_c19_serobaseline_event_group1, " (", round(100 * n_c19_serobaseline_event_group1 / n_c19_serology_baseline_group1, 1), "%) participant had a COVID-19 event after enrolment in the ", Arm1, " arm, and ", n_c19_serobaseline_event_group2, " (", round(100 * n_c19_serobaseline_event_group2 / n_c19_serology_baseline_group2, 1), "%) participant had a COVID-19 event after enrolment in the ", Arm2, " arm.", sep = ''), paste('Of these baseline seropositive participants, ', n_c19_serobaseline_event, " (", round(100 * n_c19_serobaseline_event / n_c19_serology_baseline, 1), "%) participant had a COVID-19 event after enrolment.", sep = ''))` 

#### Week 10

`r ifelse(CLOSED == T, paste(n_c19_serology_week10_group1, " (", round(100 * n_c19_serology_week10_group1 / part_c19_serology_week10_group1, 1), "%, n=", part_c19_serology_week10_group1, ") participants had a positive serology result for SARS-CoV-2 based on IgG at week 10 in the ", Arm1, " arm.  \n", n_c19_serology_week10_group2, " (", round(100 * n_c19_serology_week10_group2 / part_c19_serology_week10_group2, 1), "%, n=", part_c19_serology_week10_group2, ") participants had a positive serology result for SARS-CoV-2 based on IgG at week 10 in the ", Arm2, " arm.", sep = ''), paste(n_c19_serology_week10, " (", round(100 * n_c19_serology_week10 / part_c19_serology_week10, 1), "%, n=", part_c19_serology_week10, ") participants had a positive serology result for SARS-CoV-2 based on IgG at week 10.", sep = ''))` 

#### Week 26 

`r ifelse(CLOSED == T, paste(n_c19_serology_week26_group1, " (", round(100 * n_c19_serology_week26_group1 / part_c19_serology_week26_group1, 1), "%, n=", part_c19_serology_week26_group1, ") participants had a positive serology result for SARS-CoV-2 based on IgG at week 26 in the ", Arm1, " arm.  \n", n_c19_serology_week26_group2, " (", round(100 * n_c19_serology_week26_group2 / part_c19_serology_week26_group2, 1), "%, n=", part_c19_serology_week26_group2, ") participants had a positive serology result for SARS-CoV-2 based on IgG at week 26 in the ", Arm2, " arm.", sep = ''), paste(n_c19_serology_week26, " (", round(100 * n_c19_serology_week26 / part_c19_serology_week26, 1), "%, n=", part_c19_serology_week26, ") participants had a positive serology result for SARS-CoV-2 based on IgG at week 26.", sep = ''))` 

#### Week 52

`r ifelse(CLOSED == T, paste(n_c19_serology_week52_group1, " (", ifelse(part_c19_serology_week52_group1 == 0, 0, round(100 * n_c19_serology_week52_group1 / part_c19_serology_week52_group1, 1)), "%, n=", part_c19_serology_week52_group1, ") participants had a positive serology result for SARS-CoV-2 based on IgG at week 52 in the ", Arm1, " arm.  \n", n_c19_serology_week52_group2, " (", ifelse(part_c19_serology_week52_group2 == 0, 0, round(100 * n_c19_serology_week52_group2 / part_c19_serology_week52_group2, 1)), "%, n=", part_c19_serology_week52_group2, ") participants had a positive serology result for SARS-CoV-2 based on IgG at week 52 in the ", Arm2, " arm.", sep = ''), paste(n_c19_serology_week52, " (", ifelse(part_c19_serology_week52 == 0, 0, round(100 * n_c19_serology_week52 / part_c19_serology_week52, 1)), "%, n=", part_c19_serology_week52, ") participants had a positive serology result for SARS-CoV-2 based on IgG at week 52.", sep = ''))` 

#### Seroconversion

`r ifelse(CLOSED == T, paste(n_c19_seroconversion_group1, " (", round(100 * n_c19_seroconversion_group1 / part_c19_secondsample_group1, 1), "%, n=", part_c19_secondsample_group1, ") participants had seroconversion for SARS-CoV-2 in the ", Arm1, " arm.  \n", n_c19_seroconversion_group2, " (", round(100 * n_c19_seroconversion_group2 / part_c19_secondsample_group2, 1), "%, n=", part_c19_secondsample_group2, ") participants had seroconversion for SARS-CoV-2 in the ", Arm2, " arm.", sep = ''), paste(n_c19_seroconversion, " (", round(100 * n_c19_seroconversion / part_c19_secondsample, 1), "%, n=", part_c19_secondsample, ") participants had seroconversion for SARS-CoV-2.", sep = ''))`   
`r ifelse(CLOSED == T, paste('Of these seroconverted participants, ', n_rti_seroconversion_event_group1, " (", round(100 * n_rti_seroconversion_event_group1 / n_c19_seroconversion_group1, 1), "%) participants had an RTI event in the seroconversion period (period between last negative and first positive serology result) in the ", Arm1, " arm, and ", n_rti_seroconversion_event_group2, " (", round(100 * n_rti_seroconversion_event_group2 / n_c19_seroconversion_group2, 1), "%) participants had an RTI event in the seroconversion period in the ", Arm2, " arm.", sep = ''), paste('Of these seroconverted participants, ', n_rti_seroconversion_event, " (", round(100 * n_rti_seroconversion_event / n_c19_seroconversion, 1), "%) participants had an RTI event in the seroconversion period (period between last negative and first positive serology result).", sep = ''))` 
`r ifelse(CLOSED == T, paste('Of these seroconverted participants, ', n_c19_seroconversion_event_group1, " (", round(100 * n_c19_seroconversion_event_group1 / n_c19_seroconversion_group1, 1), "%) participants had a COVID-19 event in the seroconversion period in the ", Arm1, " arm, and ", n_c19_seroconversion_event_group2, " (", round(100 * n_c19_seroconversion_event_group2 / n_c19_seroconversion_group2, 1), "%) participants had a COVID-19 event in the seroconversion period in the ", Arm2, " arm.", sep = ''), paste('Of these seroconverted participants, ', n_c19_seroconversion_event, " (", round(100 * n_c19_seroconversion_event / n_c19_seroconversion, 1), "%) participants had a COVID-19 event in the seroconversion period.", sep = ''))` 

*Serology sampling takes place at baseline and at week 10, 26, and 52. Percentages reported are relative to the total number of samples with results reported at that timepoint, or to the number of participants with at least two serology results for the seroconversion (reported as n).*

## Latent tuberculosis infection and COVID-19 or respiratory tract infections

`r ifelse(CLOSED == T, paste("Of the reported COVID-19 cases, ", n_c19_TB_group1, " were preceded by a positive IGRA test for TB in ", part_c19_TB_group1, " (", round(100 * part_c19_TB_group1 / part_c19_group1, 1), "%) participants in the ", Arm1, " arm, and ", n_c19_TB_group2, " in ", part_c19_TB_group2, " (", round(100 * part_c19_TB_group2 / part_c19_group2, 1), "%) participants in the ", Arm2, " arm.", sep = ''), paste("Of the reported COVID-19 cases, ", n_c19_TB, " were preceded by a positive IGRA test for TB in ", part_c19_TB, " (", round(100 * part_c19_TB / part_c19, 1), "%) participants.", sep = ''))`

`r ifelse(CLOSED == T, paste("Of the reported RTI cases, ", n_rti_TB_group1, " were preceded by a positive IGRA test for TB in ", part_rti_TB_group1, " (", round(100 * part_rti_TB_group1 / part_ae_rti_group1, 1), "%) participants in the ", Arm1, " arm, and ", n_rti_TB_group2, " in ", part_rti_TB_group2, " (", round(100 * part_rti_TB_group2 / part_ae_rti_group2, 1), "%) participants in the ", Arm2, " arm.", sep = ''), paste("Of the reported RTI cases, ", n_rti_TB, " were preceded by a positive IGRA test for TB in ", part_rti_TB, " (", round(100 * part_rti_TB / part_ae_rti, 1), "%) participants.", sep = ''))`

\newpage

# Efficacy summary

## Total hospitalization 

Total hospitalization included participants hospitalized (i.e. HS $\geq$ 4) due to COVID-19 (see [primary endpoint][Primary endpoint]) as well as due to other causes.

`r ifelse(CLOSED == T, paste(n_hosp_group1, " (", round(100 * n_hosp_group1 / part_group1, 1), "%) participants were hospitalized in the ", Arm1, " arm.  \n", n_hosp_group2, " (", round(100 * n_hosp_group2 / part_group2, 1), "%) participants were hospitalized in the ", Arm2, " arm.", sep = ''), paste(n_hosp, " (", round(100 * n_hosp / part_enrol, 1), "%) participants were hospitalized.", sep = ''))` 

The endpoint is here reported based on the intention-to-treat population with right censoring only at withdrawal of consent or lost to follow-up, death, or the end of the trial.

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized versus time. `r ifelse(CLOSED == T, "The figure is stratified by study arm. ", "")`The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or lost to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized`r ifelse(CLOSED == T, ", stratified by study arm", "")`, versus time. A figure with an enhanced y-axis is shown as well for clearer visualization. 

```{r KM plot hospitalization, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Two-colour cividis palette so the groups are coloured as in the other figures
col_vir <- viridis(option = 'cividis', end = 0.75, n = 2)

# Kaplan-Meier fit per group; time is divided by 7 to match the weekly x-axis
# used below (assumes `time` is recorded in days - TODO confirm)
fit <- survfit(Surv(time / 7, status) ~ group, data = hosp_km_censor_novac)

# Survival curves with confidence band, censoring marks and number-at-risk table
ggsurvplot(
  fit,
  data = hosp_km_censor_novac,
  title = 'Kaplan-Meier plot for first hospitalization event (all causes)',
  xlab = 'Time since vaccination (weeks)',
  ylab = 'Proportion total not hospitalized',
  legend.title = '',
  palette = col_vir,
  ggtheme = theme_bw(),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE
)

```

```{r KM plot hospitalization2, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Two-colour cividis palette so the groups are coloured as in the other figures
col_vir <- viridis(option = 'cividis', end = 0.75, n = 2)

# Same Kaplan-Meier fit as in the previous chunk, refitted here so that this
# chunk can be run on its own
fit <- survfit(Surv(time / 7, status) ~ group, data = hosp_km_censor_novac)

# Identical figure, but with the y-axis restricted to 0.55-1 to zoom in on the curves
ggsurvplot(
  fit,
  data = hosp_km_censor_novac,
  title = 'Kaplan-Meier plot (enhanced y-axis) for first hospitalization event\n(all causes)',
  xlab = 'Time since vaccination (weeks)',
  ylab = 'Proportion total not hospitalized',
  ylim = c(0.55, 1),
  legend.title = '',
  palette = col_vir,
  ggtheme = theme_bw(),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE
)
 
```

*Kaplan-Meier plots stratified per risk factor can be found in [Appendix E][Appendix E: Hospitalization by risk factor/classifier].*

## Respiratory tract infections events 

`r ifelse(CLOSED == T, paste(part_ae_rti_group1, " (", round(100 * part_ae_rti_group1 / part_group1, 1), "%) participants had an RTI event in the ", Arm1, " arm.  \n", part_ae_rti_group2, " (", round(100 * part_ae_rti_group2 / part_group2, 1), "%) participants had an RTI event in the ", Arm2, " arm.", sep = ''), paste(part_ae_rti, " (", round(100 * part_ae_rti / part_enrol, 1), "%) participants had an RTI event.", sep = ''))`

The endpoint is here reported based on the intention-to-treat population with right censoring only at withdrawal of consent or lost to follow-up, death, or the end of the trial.

**The numbers in the risk table are based on the number of participants that have reached that timepoint.**

Kaplan-Meier estimates of the proportion of subjects who did not have an RTI event (HS>0) versus time. Only the first event is shown. `r ifelse(CLOSED == T, " The figure is stratified by study arm.", "")` The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or lost to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of an RTI event`r ifelse(CLOSED == T, ", stratified by study arm", "")`, versus time. A figure with an enhanced y-axis is shown as well for clearer visualization. 

```{r KM plot endpoint HS, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Two-colour cividis palette so the groups are coloured as in the other figures
col_vir <- viridis(option = 'cividis', end = 0.75, n = 2)

# Kaplan-Meier fit per group for the first RTI event; time is divided by 7 to
# match the weekly x-axis used below (assumes `time` is recorded in days - TODO confirm).
# SE calculated by the Greenwood calculation, based on the square root of the
# sum of ratio of 1 by the product of number surviving and the number of
# surviving minus 1 (see Miettinen, Eur J Epidemiol (2008) 23:585-592)
fit <- survfit(Surv(time / 7, status) ~ group, data = hs_rti_km_censor_novac)

# Survival curves with confidence band, censoring marks, number-at-risk table
# and cumulative-events table
ggsurvplot(
  fit,
  data = hs_rti_km_censor_novac,
  title = 'Kaplan-Meier plot for first RTI event (health score > 0)',
  xlab = 'Time since vaccination (weeks)',
  ylab = 'Total proportion without RTI event',
  legend.title = '',
  palette = col_vir,
  ggtheme = theme_bw(),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE,
  cumevents = TRUE
)

```

```{r KM plot endpoint HS-2, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Two-colour cividis palette so the groups are coloured as in the other figures
col_vir <- viridis(option = 'cividis', end = 0.75, n = 2)

# Same Kaplan-Meier fit as in the previous chunk, refitted here so that this
# chunk can be run on its own.
# SE calculated by the Greenwood calculation, based on the square root of the
# sum of ratio of 1 by the product of number surviving and the number of
# surviving minus 1 (see Miettinen, Eur J Epidemiol (2008) 23:585-592)
fit <- survfit(Surv(time / 7, status) ~ group, data = hs_rti_km_censor_novac)

# Identical figure (without the cumulative-events table), but with the y-axis
# restricted to 0.35-1 to zoom in on the curves
ggsurvplot(
  fit,
  data = hs_rti_km_censor_novac,
  title = 'Kaplan-Meier plot (enhanced y-axis) for first RTI event\n(health score > 0)',
  xlab = 'Time since vaccination (weeks)',
  ylab = 'Total proportion without RTI event',
  ylim = c(0.35, 1),
  legend.title = '',
  palette = col_vir,
  ggtheme = theme_bw(),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE
)
```

*Kaplan-Meier plots for all health status scores can be found in [Appendix F][Appendix F: KM plots for respiratory tract infections].*

\newpage

## Health status 

| Health status | Description | 
| ---: | ---------- |
| 0 | Healthy |
| 1 | Mild symptoms |
| 2 | Moderate symptoms |
| 3 | Severe symptoms |
| 4 | Hospitalized |
| 5 | Hospitalized, oxygen |
| 6 | Hospitalized, ventilated |
| 7 | Death |

### Health status over time for respiratory tract infections

Health status over time for RTIs for each subject in the study (dashed lines).`r ifelse(CLOSED == T, " The figure is stratified and coloured by the study arm.", "")` 

A health status score of 4 or higher means hospitalization of the participant (horizontal dotted line).

```{r health status over time, eval = eval_dsmb, echo = echo_dsmb, message = F}
# Individual health status (HS) trajectories for RTIs, one facet per group.
hs_time %>%
  # keep recorded scores from time zero onwards
  filter(!is.na(HS), Time >= 0) %>%
  ggplot(aes(Time, HS, group = factor(group), col = factor(group))) +
  # one dashed, semi-transparent line per participant
  geom_line(aes(group = PID), linetype = 'dashed', alpha = 0.25) +
  geom_point() +
  # dotted reference line at HS = 4 (hospitalization threshold)
  geom_hline(yintercept = 4, linetype = 'dotted') +
  # guide = 'none' hides the colour legend (the facets already label the groups);
  # it replaces the deprecated guide = FALSE
  scale_colour_viridis(discrete = TRUE, end = 0.75, name = 'Arm', option = 'cividis', guide = 'none') +
  scale_x_continuous(name = 'Time since vaccination (weeks)', breaks = seq(0, 70, by = 10)) +
  scale_y_continuous(limits = c(0, 7), breaks = 0:7, name = 'Individual health status score for RTI') +
  theme_bw() +
  facet_grid(~group)

```
\newpage

The figures below show the proportion of participant-events for each HS over time. Participant-events were used as participants could have multiple events with separate HS at the same time. 

```{r health status over time state plots, eval = eval_dsmb, echo = echo_dsmb, message = F}

# Shared base plot: proportion per health status (HS) over time, one line per
# group. Both figures below only differ in how the facets scale the y-axis.
hs_state_base_plot <- hs_time_0to7 %>%
  filter(Time_week >= 0) %>%
  ggplot(aes(Time_week, proportion, group = group, col = as.character(group))) +
  geom_point() +
  geom_line() +
  scale_x_continuous(name = 'Time since vaccination (weeks)', breaks = seq(0, 70, by = 10)) +
  # lower limit fixed at 0, upper limit derived from the data
  scale_y_continuous(limits = c(0, NA), name = 'Proportion of participants') +
  # guide = 'none' hides the colour legend; it replaces the deprecated guide = FALSE
  scale_color_viridis(discrete = TRUE, end = 0.75, name = 'Arm', option = 'cividis', guide = 'none') +
  theme_bw()

# Figure 1: free y-axis per HS panel, so rare states remain readable
hs_state_base_plot +
  facet_wrap(~HS, scales = 'free_y', labeller = 'label_both')

# Figure 2: common y-axis, so the panels are directly comparable
hs_state_base_plot +
  facet_wrap(~HS, labeller = 'label_both')


```

### Transitions

The table below shows the number of transitions between the HS states (from the HS identified in the row to the HS identified in the column).

| 	From $\downarrow$ to $\rightarrow$ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 
| ---: |  --- |  --- |  --- |  --- |  --- |  --- |  --- |  --- | 
| 0 | | `r hs_time_transition_01 %>% nrow()` | `r hs_time_transition_02 %>% nrow()` | `r hs_time_transition_03 %>% nrow()` | `r hs_time_transition_04 %>% nrow()` | `r hs_time_transition_05 %>% nrow()` | `r hs_time_transition_06 %>% nrow()` | `r hs_time_transition_07 %>% nrow()` | 
| 1 | `r hs_time_transition_10 %>% nrow()` | | `r hs_time_transition_12 %>% nrow()` | `r hs_time_transition_13 %>% nrow()` | `r hs_time_transition_14 %>% nrow()` | `r hs_time_transition_15 %>% nrow()` | `r hs_time_transition_16 %>% nrow()` | `r hs_time_transition_17 %>% nrow()` | 
| 2 | `r hs_time_transition_20 %>% nrow()` | `r hs_time_transition_21 %>% nrow()` | | `r hs_time_transition_23 %>% nrow()` | `r hs_time_transition_24 %>% nrow()` | `r hs_time_transition_25 %>% nrow()` | `r hs_time_transition_26 %>% nrow()` | `r hs_time_transition_27 %>% nrow()` | 
| 3 | `r hs_time_transition_30 %>% nrow()` | `r hs_time_transition_31 %>% nrow()` | `r hs_time_transition_32 %>% nrow()` | | `r hs_time_transition_34 %>% nrow()` | `r hs_time_transition_35 %>% nrow()` | `r hs_time_transition_36 %>% nrow()` | `r hs_time_transition_37 %>% nrow()` | 
| 4 | `r hs_time_transition_40 %>% nrow()` | `r hs_time_transition_41 %>% nrow()` | `r hs_time_transition_42 %>% nrow()` | `r hs_time_transition_43 %>% nrow()` | | `r hs_time_transition_45 %>% nrow()` | `r hs_time_transition_46 %>% nrow()` | `r hs_time_transition_47 %>% nrow()` | 
| 5 | `r hs_time_transition_50 %>% nrow()` | `r hs_time_transition_51 %>% nrow()` | `r hs_time_transition_52 %>% nrow()` | `r hs_time_transition_53 %>% nrow()` | `r hs_time_transition_54 %>% nrow()` | | `r hs_time_transition_56 %>% nrow()` | `r hs_time_transition_57 %>% nrow()` | 
| 6 | `r hs_time_transition_60 %>% nrow()` | `r hs_time_transition_61 %>% nrow()` | `r hs_time_transition_62 %>% nrow()` | `r hs_time_transition_63 %>% nrow()` | `r hs_time_transition_64 %>% nrow()` | `r hs_time_transition_65 %>% nrow()` | | `r hs_time_transition_67 %>% nrow()` | 


*Health status scores over time stratified per risk factors can be found in [Appendix H][Appendix H: Health status score over time].*

## Sick leave

### Baseline sick leave

Baseline sick leave recorded at enrolment is plotted below`r ifelse(CLOSED == T, " per arm", "")`. This was the number of days sick leave taken in the four weeks prior to enrolment. 

```{r sick leave baseline, eval = eval_dsmb, echo = echo_dsmb, message  = F}
# Baseline sick leave (days taken in the four weeks prior to enrolment):
# unique PID / group / baseline value combinations, computed once and reused
# for both the figure and the test in the title.
sickleave_baseline <- df_full %>%
  distinct(PID, group, sickleave_taken_4wks)

# Plot title; the closed report additionally states the stratification and the
# Mann-Whitney-Wilcoxon p-value for the between-group comparison.
sickleave_baseline_title <- 'Distribution of baseline sick leave days in the trial population'
if (CLOSED == TRUE) {
  sickleave_baseline_p <- wilcox.test(sickleave_taken_4wks ~ group, data = sickleave_baseline)$p.value
  sickleave_baseline_title <- paste0(
    sickleave_baseline_title,
    " stratified per arm\np-value (Mann-Whitney-Wilcoxon) = ",
    round(sickleave_baseline_p, 3)
  )
}

# Density-scaled histogram with an overlaid density curve, one facet per group
ggplot(sickleave_baseline, aes(sickleave_taken_4wks)) +
  geom_histogram(aes(y = ..density.., fill = factor(group)), alpha = 0.5) +
  geom_density() +
  # guide = 'none' hides the fill legend; it replaces the deprecated guide = FALSE
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = 'none', option = 'cividis') +
  scale_x_continuous(name = 'Sick leave (days)') +
  scale_y_continuous(name = 'Density') +
  ggtitle(sickleave_baseline_title) +
  theme_bw() +
  # make sure the x-axis always includes zero days
  expand_limits(x = 0) +
  facet_grid(~group)

```

### Recorded sick leave during the trial

During the monthly participant follow-up, the questionnaire asked how many days of sick leave were taken since the last contact. These days were summed per participant and shown below`r ifelse(CLOSED == T, " per arm", "")`.

`r ifelse(CLOSED == T, paste(df_full %>% filter(group == Arm1) %>% filter(sick_leave_any == 'Yes') %>% distinct(PID) %>% nrow(), " participants in the ", Arm1, " arm took sick leave, ", df_full %>% filter(sick_leave_any == 'Yes', c19_positive == 'Yes', group == Arm1) %>% distinct(PID) %>% nrow()," of which had a preceding positive COVID-19 test. ", df_full %>% filter(group == Arm2) %>% filter(sick_leave_any == 'Yes') %>% distinct(PID) %>% nrow(), " participants in the ", Arm2, " arm took sick leave, ", df_full %>% filter(sick_leave_any == 'Yes', c19_positive == 'Yes', group == Arm2) %>% distinct(PID) %>% nrow()," of which had a preceding positive COVID-19 test.", sep = ""), paste(df_full %>% filter(sick_leave_any == 'Yes') %>% distinct(PID) %>% nrow(), " participants took sick leave, ", df_full %>% filter(sick_leave_any == 'Yes', c19_positive == 'Yes') %>% distinct(PID) %>% nrow()," of which had a preceding positive COVID-19 test.", sep = ""))`

`r ifelse(CLOSED == T, paste('Mean number of sick days after a positive COVID-19 test was ', round(df_full %>% filter(sick_leave_any == 'Yes', c19_positive == 'Yes', group == Arm1)  %>% summarize(mean = mean(sick_leave_days))), ' days in the ', Arm1, ' arm, and ', round(df_full %>% filter(sick_leave_any == 'Yes', c19_positive == 'Yes', group == Arm2)  %>% summarize(mean = mean(sick_leave_days))), ' days in the ', Arm2, ' arm.', sep = ''), paste('Mean number of sick days after a positive COVID-19 test was ', round(df_full %>% filter(sick_leave_any == 'Yes', c19_positive == 'Yes') %>% summarize(mean = mean(sick_leave_days))), ' days.', sep = ''))`
`r ifelse(CLOSED == T, paste('Mean number of sick days without a positive COVID-19 test was ', round(df_full %>% filter(sick_leave_any == 'Yes', c19_positive != 'Yes', group == Arm1)  %>% summarize(mean = mean(sick_leave_days))), ' days in the ', Arm1, ' arm, and ', round(df_full %>% filter(sick_leave_any == 'Yes', c19_positive != 'Yes', group == Arm2)  %>% summarize(mean = mean(sick_leave_days))), ' days in the ', Arm2, ' arm.', sep = ''), paste('Mean number of sick days without a positive COVID-19 test was ', round(df_full %>% filter(sick_leave_any == 'Yes', c19_positive != 'Yes') %>% summarize(mean = mean(sick_leave_days))), ' days.', sep = ''))` 

```{r sick leave, eval = eval_dsmb, echo = echo_dsmb, message = F}
# Total recorded sick leave per participant during the trial, computed once and
# reused for both the figure and the test in the title.
sickleave_trial <- df_full %>%
  filter(sick_leave_any == 'Yes') %>%
  # de-duplicate follow-up records before summing the sick_leave_days
  distinct(PID, group, visit_date_4, visit_week_id_4, sick_leave_any, sick_leave_days) %>%
  group_by(PID, group) %>%
  summarize(sum_sickleave = sum(sick_leave_days)) %>%
  ungroup()

# Plot title; the closed report additionally states the stratification and the
# Mann-Whitney-Wilcoxon p-value for the between-group comparison.
sickleave_trial_title <- 'Distribution of taken sick leave days in the trial population'
if (CLOSED == TRUE) {
  sickleave_trial_p <- wilcox.test(sum_sickleave ~ group, data = sickleave_trial)$p.value
  sickleave_trial_title <- paste0(
    sickleave_trial_title,
    " stratified per arm\np-value (Mann-Whitney-Wilcoxon) = ",
    round(sickleave_trial_p, 3)
  )
}

# Density-scaled histogram with an overlaid density curve, one facet per group
ggplot(sickleave_trial, aes(sum_sickleave)) +
  geom_histogram(aes(y = ..density.., fill = factor(group)), alpha = 0.5) +
  geom_density() +
  # guide = 'none' hides the fill legend; it replaces the deprecated guide = FALSE
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = 'none', option = 'cividis') +
  scale_x_continuous(name = 'Sick leave (days)') +
  scale_y_continuous(name = 'Density') +
  # make sure the x-axis always includes zero days
  expand_limits(x = 0) +
  ggtitle(sickleave_trial_title) +
  theme_bw() +
  facet_grid(~group)

```

\newpage 

# Adverse events

## Reported events

*Events are recorded in three categories: respiratory tract infection, injection site reaction, and other. For each event, a health status score was recorded. The highest health status score per event, defined by unique event number per participant, is reported in this report.*

`r n_ae` total adverse events occurred in `r part_ae` participants.

`r ifelse(CLOSED == T, paste(n_ae_rti_group1, "respiratory tract infection  events occurred in", part_ae_rti_group1, "participants in the ", Arm1, " arm, and", n_ae_rti_group2, "respiratory tract infection events occurred in", part_ae_rti_group2, "participants in the ", Arm2, " arm."), paste(n_ae_rti, "respiratory tract infection events occurred in", part_ae_rti, "participants."))`  
`r n_ae_isr` injection site reaction adverse events occurred in `r part_ae_isr` participants.  
`r ifelse(CLOSED == T, paste(n_ae_other_group1, "other adverse events occurred in", part_ae_other_group1, "participants in the ", Arm1, " arm, and",n_ae_other_group2, "other adverse events occurred in", part_ae_other_group2, "participants in the ", Arm2, " arm."), paste(n_ae_other, "other adverse events occurred in", part_ae_other, "participants"))`  

`r ifelse(CLOSED == T, paste("Of the ", n_ae_rti, " respiratory tract infection events, ", n_ae_rti_mild, " (", round(100 * n_ae_rti_mild / n_ae_rti), "%, ", ae_rti_mild %>% filter(group == Arm1) %>%  nrow()," in the ", Arm1, " arm and ", ae_rti_mild %>% filter(group == Arm2) %>%  nrow()," in the ", Arm2, " arm) were considered mild and ", n_ae_rti_moderate, " (", round(100 * n_ae_rti_moderate / n_ae_rti), "%, ", ae_rti_moderate %>% filter(group == Arm1) %>%  nrow()," in the ", Arm1, " arm and ", ae_rti_moderate %>% filter(group == Arm2) %>%  nrow()," in the ", Arm2, " arm) were considered moderate.", sep = ''), paste("Of the ", n_ae_rti, " respiratory tract infection events, ", n_ae_rti_mild, " (", round(100 * n_ae_rti_mild / n_ae_rti), "%) were considered mild and ", n_ae_rti_moderate, " (", round(100 * n_ae_rti_moderate / n_ae_rti), "%) were considered moderate.", sep = ''))` 

Of the `r n_ae_isr` injection site reaction adverse events, `r n_ae_isr_mild` (`r round(100 * n_ae_isr_mild / n_ae_isr)`\%) were considered mild and `r n_ae_isr_moderate` (`r round(100 * n_ae_isr_moderate / n_ae_isr)`\%) were considered moderate. 

`r ifelse(CLOSED == T, paste("Of the ", n_ae_other, " other adverse events, ", n_ae_other_mild, " (", round(100 * n_ae_other_mild / n_ae_other), "%, ", ae_other_mild %>% filter(group == Arm1) %>%  nrow()," in the ", Arm1, " arm and ", ae_other_mild %>% filter(group == Arm2) %>%  nrow()," in the ", Arm2, " arm) were considered mild and ", n_ae_other_moderate, " (", round(100 * n_ae_other_moderate / n_ae_other), "%, ", ae_other_moderate %>% filter(group == Arm1) %>%  nrow()," in the ", Arm1, " arm and ", ae_other_moderate %>% filter(group == Arm2) %>%  nrow()," in the ", Arm2, " arm) were considered moderate.", sep = ''),  paste("Of the ", n_ae_other, " other adverse events, ", n_ae_other_mild, " (", round(100 * n_ae_other_mild / n_ae_other), "%) were considered mild and ", n_ae_other_moderate, " (", round(100 * n_ae_other_moderate / n_ae_other), "%) were considered moderate.", sep = ''))`

`r n_sae_previous` serious adverse events were reported in the previous DSMB report.  
`r ifelse(n_sae - n_sae_previous == 0, "There have been no additional serious adverse events since the last DSMB meeting.", ifelse(n_sae - n_sae_previous == 1, paste(n_sae - n_sae_previous, "additional serious adverse event was reported since the last DSMB meeting."), paste(n_sae - n_sae_previous, "additional serious adverse events were reported since the last DSMB meeting.")))`   
`r ifelse(CLOSED == T, paste("In total,", n_sae_group1, "serious adverse events were reported in", part_sae_group1, "participants in the ", Arm1, " arm, and ", n_sae_group2, "serious adverse events were reported in", part_sae_group2, "participants in the ", Arm2, " arm."), paste("In total,", n_sae, "serious adverse events were reported in", part_sae, "participants."))`  
`r paste("SAEs were ", paste(c('unrelated', 'unlikely related', 'possibly related', 'probably related', 'definitely related')[c(n_sae_unrelated, n_sae_unlikelyrelated, n_sae_possiblyrelated, n_sae_probablyrelated, n_sae_definitely) > 0], collapse = ', or '), " to the intervention.", sep = '')`   

### Grade 2 or higher

<!-- Percentages use the matching denominator: part_group1 / part_group2 per arm in the closed report, part_enrol for the pooled open report. -->

`r ifelse(CLOSED == T, paste(n_ae_grade_234_group1, " adverse events of grade 2 or higher occurred in ", part_ae_grade_234_group1, " (", round(100 * part_ae_grade_234_group1 / part_group1, 1), "%) participants in the ", Arm1, " arm, and ", n_ae_grade_234_group2, " adverse events of grade 2 or higher occurred in ", part_ae_grade_234_group2, " (", round(100 * part_ae_grade_234_group2 / part_group2, 1), "%) participants in the ", Arm2, " arm.", sep = ''), paste(n_ae_grade_234, " adverse events of grade 2 or higher occurred in ", part_ae_grade_234, " (", round(100 * part_ae_grade_234 / part_enrol, 1), "%) participants.", sep = ''))`
`r ifelse(CLOSED == T, paste('Of these, ', n_ae_grade_3_group1, " were grade 3 (in ", part_ae_grade_3_group1, " (", round(100 * part_ae_grade_3_group1 / part_group1, 1), "%) participants) and ", n_ae_grade_4_group1, " were grade 4 (in ", part_ae_grade_4_group1, " (", round(100 * part_ae_grade_4_group1 / part_group1, 1), "%) participants) in the ", Arm1, " arm, and ", n_ae_grade_3_group2, " were grade 3 (in ", part_ae_grade_3_group2, " (", round(100 * part_ae_grade_3_group2 / part_group2, 1), "%) participants) and ", n_ae_grade_4_group2, " were grade 4 (in ", part_ae_grade_4_group2, " (", round(100 * part_ae_grade_4_group2 / part_group2, 1), "%) participants) in the ", Arm2, " arm.", sep = ''), paste('Of these, ', n_ae_grade_3, " were grade 3 (in ", part_ae_grade_3, " (", round(100 * part_ae_grade_3 / part_enrol, 1), "%) participants) and ", n_ae_grade_4, " were grade 4 (in ", part_ae_grade_4, " (", round(100 * part_ae_grade_4 / part_enrol, 1), "%) participants).", sep = ''))`

<!-- Percentages use the matching denominator: part_group1 / part_group2 per arm in the closed report, part_enrol for the pooled open report. -->

`r ifelse(CLOSED == T, paste(n_ae_grade_234_ISR_group1, " injection site reaction of grade 2 or higher occurred in ", part_ae_grade_234_ISR_group1, " (", round(100 * part_ae_grade_234_ISR_group1 / part_group1, 1), "%) participant in the ", Arm1, " arm, and ", n_ae_grade_234_ISR_group2, " injection site reactions of grade 2 or higher occurred in ", part_ae_grade_234_ISR_group2, " (", round(100 * part_ae_grade_234_ISR_group2 / part_group2, 1), "%) participants in the ", Arm2, " arm.", sep = ''), paste(n_ae_grade_234_ISR, " injection site reactions of grade 2 or higher occurred in ", part_ae_grade_234_ISR, " (", round(100 * part_ae_grade_234_ISR / part_enrol, 1), "%) participants.", sep = ''))`


### Intensive care admissions

<!-- Both branches are pasted with sep = '', so every fragment carries its own leading/trailing space. -->

`r ifelse(CLOSED == T, paste(n_ae_icu_group1, " total intensive care admissions occurred in ", part_ae_icu_group1, " participants in the ", Arm1, " arm, and ", n_ae_icu_group2, " total intensive care admissions occurred in ", part_ae_icu_group2, " participants in the ", Arm2, " arm.", sep = ''), paste(n_ae_icu, " total intensive care admissions occurred in ", part_ae_icu, " participants.", sep = ''))`
`r ifelse(CLOSED == T, paste('Of these, ', nrow(c19_HS6_group1), " were due to COVID-19 in the ", Arm1, " arm, and ", nrow(c19_HS6_group2), " in the ", Arm2, " arm.", sep = ''), paste('Of these, ', nrow(c19_HS6), ' were due to COVID-19', sep = ''))`

*More details are described per event type below. Full details of the adverse events can be found in [Appendix B][Respiratory tract infection events (full details)]*.

## Health status score

Every adverse event received a health status score (see [Health status definitions][Health status] for their definitions). Health status scores had a weekly time resolution, within which interval the highest experienced health status was recorded. 

\newpage

## Respiratory tract infection events 

MedDRA lowest level terms were used to describe events; see [Appendix I][Appendix I: MedDRA] for the full definitions. The definition of event seriousness followed the corresponding recorded health status (see [Health status definitions][Health status]).

| Classification | Recorded events | Event details | 
|--- | --- | ------------ |
|**Total events**| `r n_ae_rti` RTI events in `r part_ae_rti` participants. |
|||
|**Healthy events**| `r n_ae_rti_healthy` healthy RTI events in `r part_ae_rti_healthy` participants. | `r if(CLOSED == F){unique(ae_rti_healthy$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_healthy[ae_rti_healthy$group == Arm1,]), " healthy RTI events in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_healthy$LLT[ae_rti_healthy$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_healthy[ae_rti_healthy$group == Arm2,]), " healthy RTI events in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_healthy$LLT[ae_rti_healthy$group == Arm2])}` |
|||
|**Mild events**| `r n_ae_rti_mild` mild RTI events in `r part_ae_rti_mild` participants. | `r if(CLOSED == F){unique(ae_rti_mild$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_mild[ae_rti_mild$group == Arm1,]), " mild RTI events in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_mild$LLT[ae_rti_mild$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_mild[ae_rti_mild$group == Arm2,]), " mild RTI events in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_mild$LLT[ae_rti_mild$group == Arm2])}` |
|||
|**Moderate events**| `r n_ae_rti_moderate` moderate RTI events in `r part_ae_rti_moderate` participants. | `r if(CLOSED == F){unique(ae_rti_moderate$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_moderate[ae_rti_moderate$group == Arm1,]), " moderate RTI events in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_moderate$LLT[ae_rti_moderate$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_moderate[ae_rti_moderate$group == Arm2,]), " moderate RTI events in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_moderate$LLT[ae_rti_moderate$group == Arm2])}` |
|||
|**Severe events**| `r n_ae_rti_severe` severe RTI events in `r part_ae_rti_severe` participants. | `r if(CLOSED == F){unique(ae_rti_severe$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_severe[ae_rti_severe$group == Arm1,]), " severe RTI events in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_severe$LLT[ae_rti_severe$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_severe[ae_rti_severe$group == Arm2,]), " severe RTI events in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_severe$LLT[ae_rti_severe$group == Arm2])}` |
|||
|**Hospitalization**| `r n_ae_rti_hosp` RTI events leading to hospitalization in `r part_ae_rti_hosp` participants. | `r if(CLOSED == F){unique(ae_rti_hosp$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_hosp[ae_rti_hosp$group == Arm1,]), " RTI events leading to hospitalization in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_hosp$LLT[ae_rti_hosp$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_hosp[ae_rti_hosp$group == Arm2,]), " RTI events leading to hospitalization in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_hosp$LLT[ae_rti_hosp$group == Arm2])}` |
|||
|**Hospitalization with oxygen**| `r n_ae_rti_hosp_oxy` RTI events leading to hospitalization with oxygen in `r part_ae_rti_hosp_oxy` participants. | `r if(CLOSED == F){unique(ae_rti_hosp_oxy$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_hosp_oxy[ae_rti_hosp_oxy$group == Arm1,]), " RTI events leading to hospitalization with oxygen in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_hosp_oxy$LLT[ae_rti_hosp_oxy$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_hosp_oxy[ae_rti_hosp_oxy$group == Arm2,]), " RTI events leading to hospitalization with oxygen in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_hosp_oxy$LLT[ae_rti_hosp_oxy$group == Arm2])}` |
|||
|**Hospitalization with ventilation**| `r n_ae_rti_hosp_vent` RTI event leading to hospitalization with ventilation in `r part_ae_rti_hosp_vent` participant. | `r if(CLOSED == F){unique(ae_rti_hosp_vent$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_hosp_vent[ae_rti_hosp_vent$group == Arm1,]), " RTI event leading to hospitalization with ventilation in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_hosp_vent$LLT[ae_rti_hosp_vent$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_hosp_vent[ae_rti_hosp_vent$group == Arm2,]), " RTI event leading to hospitalization with ventilation in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_hosp_vent$LLT[ae_rti_hosp_vent$group == Arm2])}` |
|||
|**Death**| `r n_ae_rti_dead` fatal RTI event in `r part_ae_rti_dead` participant. | `r if(CLOSED == F){unique(ae_rti_dead$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_dead[ae_rti_dead$group == Arm1,]), " fatal RTI events in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_dead$LLT[ae_rti_dead$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_rti_dead[ae_rti_dead$group == Arm2,]), " fatal RTI events in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_rti_dead$LLT[ae_rti_dead$group == Arm2])}` |

```{r rti per site, eval = eval_dsmb, echo = echo_dsmb, message = F}
# Number of COVID-19 events recorded per site
covid_by_site <- c19_event %>%
  group_by(site) %>%
  summarize(COVID19 = n())

# Table of RTI events per site, with the number of those that were COVID-19
df_full %>% # the full event dataset including PID
  # report_date is the input column from the event_rti datasets
  filter(report_date != '') %>%
  # sort by descending HS so that distinct() keeps the highest-HS row per event
  arrange(desc(event_HS)) %>%
  distinct(PID, event_number, .keep_all = TRUE) %>%
  # count RTI events per site
  group_by(site) %>%
  summarize(RTIs = n()) %>%
  full_join(covid_by_site, by = 'site') %>%
  # sites without COVID-19 events get a zero instead of NA
  mutate(`Of which COVID19` = ifelse(is.na(COVID19), 0, COVID19)) %>%
  select(-COVID19) %>%
  kable() %>%
  kable_styling(latex_options = 'HOLD_position', position = 'center') %>%
  # bold header row followed by a horizontal rule
  row_spec(0, bold = TRUE, hline_after = TRUE)


```


\newpage

## Injection site reaction events 

<!-- **Injection site reaction (ISR) events are not reported for the two arm separately to prevent the risk of unblinding due to the high incidence of ISRs as result of BCG vaccination in comparison to the placebo injection.**  -->

MedDRA lowest level terms were used to describe events; see [Appendix I][Appendix I: MedDRA] for the full definitions. The definition of event seriousness followed the corresponding recorded health status (see [Health status definitions][Health status]).

| Classification | Recorded events | Event details | 
|--- | --- | ------ |
|**Total events**| `r n_ae_isr` ISR events in `r part_ae_isr` participants. |
|||
|**Healthy events**| `r n_ae_isr_healthy` healthy ISR events in `r part_ae_isr_healthy` participants. | `r unique(ae_isr_healthy$LLT)`
|||
|**Mild events**| `r n_ae_isr_mild` mild ISR events in `r part_ae_isr_mild` participants. | `r unique(ae_isr_mild$LLT)`
|||
|**Moderate events**| `r n_ae_isr_moderate` moderate ISR events in `r part_ae_isr_moderate` participants. | `r unique(ae_isr_moderate$LLT)`
|||
|**Severe events**| `r n_ae_isr_severe` severe ISR events in `r part_ae_isr_severe` participants. | `r unique(ae_isr_severe$LLT)`
|||
|**Hospitalization**| `r n_ae_isr_hosp` ISR events leading to hospitalization in `r part_ae_isr_hosp` participants. | `r unique(ae_isr_hosp$LLT)`
|||
|**Hospitalization with oxygen**| `r n_ae_isr_hosp_oxy` ISR events leading to hospitalization with oxygen in `r part_ae_isr_hosp_oxy` participants. | `r unique(ae_isr_hosp_oxy$LLT)`
|||
|**Hospitalization with ventilation**| `r n_ae_isr_hosp_vent` ISR events leading to hospitalization with ventilation in `r part_ae_isr_hosp_vent` participants. | `r unique(ae_isr_hosp_vent$LLT)`
|||
|**Death**| `r n_ae_isr_dead` fatal ISR events in `r part_ae_isr_dead` participants. | `r unique(ae_isr_dead$LLT)`


\newpage

## Other events 

MedDRA lowest level terms were used to describe events; see [Appendix I][Appendix I: MedDRA] for the full definitions. The definition of event seriousness followed the corresponding recorded health status (see [Health status definitions][Health status]).

| Classification | Recorded events | Event details | 
|--- | --- | ------------ |
|**Total events**| `r n_ae_other` other events in `r part_ae_other` participants. |
|||
|**Healthy events**| `r n_ae_other_healthy` healthy other events in `r part_ae_other_healthy` participants. | `r if(CLOSED == F){unique(ae_other_healthy$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_healthy[ae_other_healthy$group == Arm1,]), " healthy other events in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_healthy$LLT[ae_other_healthy$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_healthy[ae_other_healthy$group == Arm2,]), " healthy other events in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_healthy$LLT[ae_other_healthy$group == Arm2])}` |
|||
|**Mild events**| `r n_ae_other_mild` mild other events in `r part_ae_other_mild` participants. | `r if(CLOSED == F){unique(ae_other_mild$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_mild[ae_other_mild$group == Arm1,]), " mild other events in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_mild$LLT[ae_other_mild$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_mild[ae_other_mild$group == Arm2,]), " mild other events in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_mild$LLT[ae_other_mild$group == Arm2])}` |
|||
|**Moderate events**| `r n_ae_other_moderate` moderate other events in `r part_ae_other_moderate` participants. | `r if(CLOSED == F){unique(ae_other_moderate$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_moderate[ae_other_moderate$group == Arm1,]), " moderate other events in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_moderate$LLT[ae_other_moderate$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_moderate[ae_other_moderate$group == Arm2,]), " moderate other events in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_moderate$LLT[ae_other_moderate$group == Arm2])}` |
|||
|**Severe events**| `r n_ae_other_severe` severe other events in `r part_ae_other_severe` participants. | `r if(CLOSED == F){unique(ae_other_severe$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_severe[ae_other_severe$group == Arm1,]), " severe other events in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_severe$LLT[ae_other_severe$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_severe[ae_other_severe$group == Arm2,]), " severe other events in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_severe$LLT[ae_other_severe$group == Arm2])}` |
|||
|**Hospitalization**| `r n_ae_other_hosp` other events leading to hospitalization in `r part_ae_other_hosp` participants. | `r if(CLOSED == F){unique(ae_other_hosp$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_hosp[ae_other_hosp$group == Arm1,]), " other events leading to hospitalization in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_hosp$LLT[ae_other_hosp$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_hosp[ae_other_hosp$group == Arm2,]), " other events leading to hospitalization in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_hosp$LLT[ae_other_hosp$group == Arm2])}` |
|||
|**Hospitalization with oxygen**| `r n_ae_other_hosp_oxy` other event leading to hospitalization with oxygen in `r part_ae_other_hosp_oxy` participant. | `r if(CLOSED == F){unique(ae_other_hosp_oxy$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_hosp_oxy[ae_other_hosp_oxy$group == Arm1,]), " other event leading to hospitalization with oxygen in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_hosp_oxy$LLT[ae_other_hosp_oxy$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_hosp_oxy[ae_other_hosp_oxy$group == Arm2,]), " other event leading to hospitalization with oxygen in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_hosp_oxy$LLT[ae_other_hosp_oxy$group == Arm2])}` |
|||
|**Hospitalization with ventilation**| `r n_ae_other_hosp_vent` other events leading to hospitalization with ventilation in `r part_ae_other_hosp_vent` participants. | `r if(CLOSED == F){unique(ae_other_hosp_vent$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_hosp_vent[ae_other_hosp_vent$group == Arm1,]), " other events leading to hospitalization with ventilation in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_hosp_vent$LLT[ae_other_hosp_vent$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_hosp_vent[ae_other_hosp_vent$group == Arm2,]), " other events leading to hospitalization with ventilation in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_hosp_vent$LLT[ae_other_hosp_vent$group == Arm2])}` |
|||
|**Death**| `r n_ae_other_dead` fatal other events in `r part_ae_other_dead` participants. | `r if(CLOSED == F){unique(ae_other_dead$LLT)}`
|||
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_dead[ae_other_dead$group == Arm1,]), " fatal other events in the ", Arm1, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_dead$LLT[ae_other_dead$group == Arm1])}` |
|| `r ifelse(CLOSED == T, paste(nrow(ae_other_dead[ae_other_dead$group == Arm2,]), " fatal other events in the ", Arm2, " arm", sep = ""), "")` | `r if(CLOSED == T){unique(ae_other_dead$LLT[ae_other_dead$group == Arm2])}` |

\newpage

## Serious adverse events (SAE)

| Classification | Recorded events | Causality | Event details | 
|--- | --- | --- | ------ |
|**Total SAEs**| `r n_sae` SAEs in `r part_sae` participants. | `r if(CLOSED == F){  paste(  ifelse(n_sae_unrelated > 0, paste(n_sae_unrelated, 'events unrelated to IP. '), ''),  ifelse(n_sae_unlikelyrelated > 0, paste(n_sae_unlikelyrelated, 'event unlikely related to IP. '), ''),  ifelse(n_sae_possiblyrelated > 0, paste(n_sae_possiblyrelated, 'events possibly related to IP. '), ''),  ifelse(n_sae_probablyrelated > 0, paste(n_sae_probablyrelated, 'events probably related to IP. '), ''),  ifelse(n_sae_definitely > 0, paste(n_sae_definitely, 'events definitely related to IP. '), ''),   sep = ''  )}` |`r if(CLOSED == F){sae$LLT[order(sae$LLT)]}`
|||
|`r ifelse(CLOSED == T, "**SAEs per arm**", "")`| `r ifelse(CLOSED == T, paste(nrow(sae[sae$group == Arm1,]), " SAEs in the ", Arm1, " arm.", sep = ''), "")` | `r if(CLOSED == T){  paste(ifelse(n_sae_unrelated_group1 > 0, paste(n_sae_unrelated_group1, 'events unrelated to IP. '), ''),  ifelse(n_sae_unlikelyrelated_group1 > 0, paste(n_sae_unlikelyrelated_group1, 'events unlikely related to IP. '), ''),  ifelse(n_sae_possiblyrelated_group1 > 0, paste(n_sae_possiblyrelated_group1, 'events possibly related to IP. '), ''),  ifelse(n_sae_probablyrelated_group1 > 0, paste(n_sae_probablyrelated_group1, 'events probably related to IP. '), ''),  ifelse(n_sae_definitely_group1 > 0, paste(n_sae_definitely_group1, 'events definitely related to IP. '), ''),   sep = ''  )}` |`r if(CLOSED == T){sae$LLT[sae$group == Arm1][order(sae$LLT[sae$group == Arm1])]}` |
|| `r ifelse(CLOSED == T, paste(nrow(sae[sae$group == Arm2,]), " SAEs in the ", Arm2, " arm.", sep = ''), "")` | `r if(CLOSED == T){  paste(ifelse(n_sae_unrelated_group2 > 0, paste(n_sae_unrelated_group2, 'events unrelated to IP. '), ''),  ifelse(n_sae_unlikelyrelated_group2 > 0, paste(n_sae_unlikelyrelated_group2, 'events unlikely related to IP. '), ''),  ifelse(n_sae_possiblyrelated_group2 > 0, paste(n_sae_possiblyrelated_group2, 'events possibly related to IP. '), ''),  ifelse(n_sae_probablyrelated_group2 > 0, paste(n_sae_probablyrelated_group2, 'events probably related to IP. '), ''),  ifelse(n_sae_definitely_group2 > 0, paste(n_sae_definitely_group2, 'events definitely related to IP. '), ''),   sep = ''  )}` | `r if(CLOSED == T){sae$LLT[sae$group == Arm2][order(sae$LLT[sae$group == Arm2])]}` |

*Full details of the adverse events can be found in [Appendix B][Serious adverse events (full details)]*.

### Serious adverse events per site

```{r sae per site, eval = eval_dsmb, echo = echo_dsmb, message = F}
# Per-site count of SAEs and, of those, the SAEs whose event name matches any
# of the COVID-19 spellings in spelling$event_name (zero when a site has none).
df_sae %>%
  mutate(covid19_match = grepl(paste(spelling$event_name, collapse = "|"), event_name)) %>%
  group_by(site) %>%
  summarize(SAEs = n(), `Of which COVID19` = sum(covid19_match)) %>%
  kable() %>%
  kable_styling(position = "center", latex_options = "HOLD_position") %>%
  row_spec(0, bold = TRUE, hline_after = TRUE)


```

In the modelling approach, all relevant risk factors will be tested. Afterwards, site can be tested as well, but the signal will probably be accounted for by the risk factors already.

*Full risk factors per site can be found in [Appendix J][Appendix J: Risk factors per site].*

## Deaths

`r n_deaths` total fatal events`r ifelse(CLOSED == F, paste(", with causes of deaths: ", sep = ""), "")``r if(CLOSED == F){unique(ae_dead$LLT)}`. 

`r ifelse(CLOSED == T, paste(nrow(ae_dead[ae_dead$group == Arm1,]), " fatal events in the ", Arm1, " arm, with cause of deaths: ", sep = ""), "")``r if(CLOSED == T){unique(ae_dead$LLT[ae_dead$group == Arm1])}``r if(CLOSED == T){'.'}` 

`r ifelse(CLOSED == T, ifelse(nrow(ae_dead[ae_dead$group == Arm2,]) <1, paste("No fatal events in the ", Arm2, " arm", sep = ''), paste(nrow(ae_dead[ae_dead$group == Arm2,]), " fatal events in the ", Arm2, " arm, with cause of deaths: ", sep = "")), "")``r if(CLOSED == T){unique(ae_dead$LLT[ae_dead$group == Arm2])}``r if(CLOSED == T){'.'}`  

### Risk factors

In the table below, the demographics and risk factors of the participants that suffered a fatal event (as identified by LLT) are shown. Cells are empty in case of no data available.

`r if(CLOSED == T) {details_deaths %>% select(-PID) %>% t() %>% kable()} else {details_deaths %>% select(-PID, -group) %>% t() %>% kable()}`


\newpage

# Protocol deviations

`r n_deviations` protocol deviations associated with `r part_deviations` participants were reported.

`r ifelse(n_deviations_safety == 0, "None of the deviations impacted participant safety.", paste(n_deviations_safety, "of the deviations impacted participant safety."))`

`r ifelse(n_deviations_integrity == 0, "None of the deviations impacted scientific integrity.", paste(n_deviations_integrity, "of the deviations impacted scientific integrity."))`

## Adherence

*A risk in the trial was the lack of adherence, when health care workers might get their own BCG vaccination once it would become clear it was efficacious.*

`r ifelse(CLOSED == T, paste(part_vaccine_group1, " (", round(100 * part_vaccine_group1 / part_group1, 1), "%) participants recorded a flu, BCG, or other (non-COVID-19) vaccination since the start of the trial in the ", Arm1, " arm.  \n", part_vaccine_group2, " (", round(100 * part_vaccine_group2 / part_group2, 1), "%) participants recorded a flu, BCG, or other (non-COVID-19) vaccination since the start of the trial in the ", Arm2, " arm.", sep = ''), paste(part_vaccine, " (", round(100 * part_vaccine / part_enrol, 1), "%) participants recorded a flu, BCG, or other (non-COVID-19) vaccination since the start of the trial.", sep = ''))`

## Drop-out

`r ifelse(CLOSED == T, paste(part_discont, " participants were discontinued (withdrawn), ", part_discont_group1, " in the ", Arm1, " arm and ", part_discont_group2, " in the ", Arm2, " arm.", sep = ""), paste(part_discont, " participants were discontinued (withdrawn).", sep = ""))`
`r ifelse(CLOSED == T, paste("Of those, ", part_discont_losttofu, " participants were lost to follow-up, ", part_discont_losttofu_group1, " in the ", Arm1, " arm and ", part_discont_losttofu_group2, " in the ", Arm2, " arm, and ", part_discont - part_discont_losttofu, " participants lost interest, ", part_discont_group1 - part_discont_losttofu_group1, " in the ", Arm1, " arm and ", part_discont_group2 - part_discont_losttofu_group2, " in the ", Arm2, " arm.", sep = ""), paste("Of those, ", part_discont_losttofu, " participants were lost to follow-up, and ", part_discont - part_discont_losttofu, " participants lost interest.", sep = ""))`
*A drop-out model could be considered when drop-out was substantial and informative.*

\newpage

# Quality management data

`r n_qc` records were subject of quality review, which took place on `r qc_date`.

In `r n_qc_yes` of the reviewed data entries (`r round(100* n_qc_yes/n_qc)`\%), correction was needed.

\newpage

# Quality control data analysis

This script was QC'ed by `r reviewer_data_analysis` on `r date_last_review_data_analysis`:

The QC was documented in the QC report, no major findings were reported.

## Data Definition Table primary endpoint

For the Cox proportional hazard model, the following data structure was used:

| **Data item** | **Definition** | **Possible values** | **Unit** | **Data source** | 
| ----- | ----- | --- | - | ----- | 
| **PID** | Personal identifier | BCGxxxx (x = numerical) | - | enrolment_randomisation.dat; participant_ID.dat |
| **group** | Study arm | 1 or 2 | - | group.dat |
| **time** | Time of recorded event | 0 (onset) - 52 (end of trial) | Week | event_isr.dat; event_isr_other_fu.dat; event_other.dat; event_rti.dat; event_rti_cont.dat; event_rti_fu.dat | 
| **status** | Hospitalization of participant ($HS \geq 4$) | 0 or 1 | - | event_isr.dat; event_isr_other_fu.dat; event_other.dat; event_rti.dat; event_rti_cont.dat; event_rti_fu.dat |
| **age** | Age of participant as calculated from date of birth | $\geq 18$ | Year | demographics_contact.dat | 
| **gender** | Gender of participant | Male, Female  | - | demographics_contact.dat | 
| **BMI** | Body mass index of participant | $\geq 15$  | $\mathrm{kg/m^2}$ | vital_signs.dat | 
| **ethnicity** | Ethnicity of participant | `r unique(df_full$ethnicity)`  | - | demographics_contact.dat | 
| **job_category** | Job category of participant | `r unique(df_full$job_category)`  | - | exposure_assessment.dat | 
| **medhis_dm** | Medical history of diabetes mellitus | Yes, No | - | medical_history.dat | 
| **medhis_hyptens** | Medical history of hypertension | Yes, No | - | medical_history.dat | 
| **medhis_cvd** | Medical history of cardiovascular diseases | Yes, No | - | medical_history.dat |
| **medhis_kd** | Medical history of kidney disease | Yes, No | - | medical_history.dat |
| **medhis_asthma** | Medical history of asthma | Yes, No | - | medical_history.dat |
| **medhis_copd** | Medical history of chronic obstructive pulmonary disease | Yes, No | - | medical_history.dat |
| **medhis_otherlung** | Medical history of other lung conditions | Yes, No | - | medical_history.dat |
| **bcg_scar** | Scar of previous BCG vaccination | Yes, No | - | meds.dat |
| **pack_years** | Smoking habit reported by participant | $\geq 0$ |  packs of cigarettes/year | social_history.dat |
| **site** | Clinical site | Central, Eden, UCT |  - | enrolment_randomisation.dat; participant_ID.dat (based on PID, 1-450 Central, 451-500 Eden, 5000-5500 UCT) |
| **serobase** | SARS-CoV-2 serology status at baseline | Positive, negative, equivocal, no result |  - | serology.dat |
| **censor** | Censoring due to loss of follow-up or withdrawal of consent | 0 or 1 |  - | serology.dat |
| **expect_interact** | Expectation of the participant of interaction with COVID-19 positive patients | Yes, No |  - | exposure_assessment.dat |


## R version:

`r version$version.string`

### Packages:

#### KableExtra

Version `r packageVersion('kableExtra')`
```{r citation packages 1, echo = echo_dsmb, eval = eval_dsmb, comment = ''}
# Plain-text citation for the kableExtra package; the BibTeX entry is omitted.
print(citation(package = "kableExtra"), bibtex = FALSE)
```

#### Knitr

Version `r packageVersion('knitr')`
```{r citation packages 2, echo = echo_dsmb, eval = eval_dsmb, comment = ''}
# Plain-text citation for the knitr package; the BibTeX entry is omitted.
print(citation(package = "knitr"), bibtex = FALSE)
```

#### Survival

Version `r packageVersion('survival')`
```{r citation packages 4, echo = echo_dsmb, eval = eval_dsmb, comment = ''}
# Plain-text citation for the survival package; the BibTeX entry is omitted.
print(citation(package = "survival"), bibtex = FALSE)
```

#### Survminer

Version `r packageVersion('survminer')`
```{r citation packages 5, echo = echo_dsmb, eval = eval_dsmb, comment = ''}
# Plain-text citation for the survminer package; the BibTeX entry is omitted.
print(citation(package = "survminer"), bibtex = FALSE)
```

#### Tidyverse

Version `r packageVersion('tidyverse')`
```{r citation packages 6, echo = echo_dsmb, eval = eval_dsmb, comment = ''}
# Plain-text citation for the tidyverse package; the BibTeX entry is omitted.
print(citation(package = "tidyverse"), bibtex = FALSE)
```

#### Viridis

Version `r packageVersion('viridis')`
```{r citation packages 7, echo = echo_dsmb, eval = eval_dsmb, comment = ''}
# Plain-text citation for the viridis package; the BibTeX entry is omitted.
print(citation(package = "viridis"), bibtex = FALSE)
```

\newpage

# Appendix A: Power calculation

This document provides preliminary sample size calculations for a clinical trial into vaccination against SARS-CoV-2 to prevent COVID-19 in health care workers. 

### Assumptions

- Attack rate (probability of getting infected) was assumed to be 30-80\% which was highly variable  
    - [9-12\% was reported by the international council of nurses](https://www.icn.ch/news/icn-tells-bbc-world-news-viewers-rising-rate-covid-19-infection-amongst-health-workers)
    - [30\% was reported for close contacts](https://doi.org/10.1016/S0140-6736(20)30462-1)
    - [80\% was reported for unmitigated epidemics](https://doi.org/10.1016/S1473-3099(20)30243-7)
    - These reports are based on tests on symptomatic patients and the likely number might be 5-10 fold higher
- Healthcare workers use personal protective equipment (PPE) and would be on the lower end of this rate
- Of those infected, 20\% would likely be hospitalised (15\% severe, 5\% critical following reporting in Wuhan)
- A reduction of 30\% in these numbers would be considered significant
- Primary endpoint was the proportion of participants hospitalised

### Estimate informed by literature references 

The power calculation resulted in a sample size of 220 participants per arm assuming an attack rate in health care workers of 30%, hospitalisation rate of 20% and a reduction by vaccination of 75%. These assumptions were based on [Liu et al (2020)](https://doi.org/10.1016/S0140-6736(20)30462-1), the case severity reporting from Wuhan, and the reduction in respiratory infections by BCG previously reported by [Wardhana et al (2011)](https://pubmed.ncbi.nlm.nih.gov/21979284/) and [Nemes et al (2018)](https://doi.org/10.1056/nejmoa1714021), respectively.

### Sample size  

Using the assumptions stated above, the **sample size per study arm** ranged between `r ceiling(power.prop.test(p1 = 0.8*0.2, p2 = 0.8*0.2*0.70, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)` for 80\% attack or infection rate to `r ceiling(power.prop.test(p1 = 0.3*0.2, p2 = 0.3*0.2*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)` for 30\% attack or infection rate to even `r ceiling(power.prop.test(p1 = 0.1*0.2, p2 = 0.1*0.2*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)` for 10\% attack or infection rate. All scenarios are shown in the tables below, with varying attack or infection rate, hospitalisation rate, and reduction by the vaccine.

### Power calculations

This clinical trial had a proportional endpoint and power calculations to determine sample size were performed for different scenarios based on the stated assumptions. The group size increased with a smaller vaccine effect, which would result in a smaller difference between the test and control group. It was therefore important to have a realistic estimate of the difference. The group size also increased with a decreasing attack or infection rate of health care professionals (the more rare an event, the larger the group to observe it). 

For a two-sample test (1 test, 1 control) for proportions, a power calculation based on the Pearson chi-square test with continuity correction (for smaller sample sizes) was performed. Here, a sample size calculation for two sample populations with a proportion of 80% positive (control) and 50% reduction (test) with $\alpha = 0.05$ and $\beta = 0.2$ (i.e. 80% power) and a one-sided test was performed.

These scenarios did not take into account a drop-out rate, which should be considered for the final trial design.

### Table of sample size for 10\% hospitalisation

Attack rate | Hospitalisation rate | Reduction by vaccination | Sample size per arm
-----------|-----------|--------------|-------------:
80\% | 10\% | 75\% | `r ceiling(power.prop.test(p1 = 0.8*0.1, p2 = 0.8*0.1*0.25, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
80\% | 10\% | 50\% | `r ceiling(power.prop.test(p1 = 0.8*0.1, p2 = 0.8*0.1*0.5, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
80\% | 10\% | 30\% | `r ceiling(power.prop.test(p1 = 0.8*0.1, p2 = 0.8*0.1*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
80\% | 10\% | 10\% | `r ceiling(power.prop.test(p1 = 0.8*0.1, p2 = 0.8*0.1*0.9, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
50\% | 10\% | 75\% | `r ceiling(power.prop.test(p1 = 0.5*0.1, p2 = 0.5*0.1*0.25, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
50\% | 10\% | 50\% | `r ceiling(power.prop.test(p1 = 0.5*0.1, p2 = 0.5*0.1*0.5, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
50\% | 10\% | 30\% | `r ceiling(power.prop.test(p1 = 0.5*0.1, p2 = 0.5*0.1*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
50\% | 10\% | 10\% | `r ceiling(power.prop.test(p1 = 0.5*0.1, p2 = 0.5*0.1*0.9, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 10\% | 75\% | `r ceiling(power.prop.test(p1 = 0.3*0.1, p2 = 0.3*0.1*0.25, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 10\% | 55\% | `r ceiling(power.prop.test(p1 = 0.3*0.1, p2 = 0.3*0.1*0.45, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 10\% | 50\% | `r ceiling(power.prop.test(p1 = 0.3*0.1, p2 = 0.3*0.1*0.5, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 10\% | 45\% | `r ceiling(power.prop.test(p1 = 0.3*0.1, p2 = 0.3*0.1*0.55, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 10\% | 33\% | `r ceiling(power.prop.test(p1 = 0.3*0.1, p2 = 0.3*0.1*0.67, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 10\% | 30\% | `r ceiling(power.prop.test(p1 = 0.3*0.1, p2 = 0.3*0.1*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 10\% | 10\% | `r ceiling(power.prop.test(p1 = 0.3*0.1, p2 = 0.3*0.1*0.9, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
20\% | 10\% | 75\% | `r ceiling(power.prop.test(p1 = 0.2*0.1, p2 = 0.2*0.1*0.25, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
20\% | 10\% | 50\% | `r ceiling(power.prop.test(p1 = 0.2*0.1, p2 = 0.2*0.1*0.5, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
20\% | 10\% | 30\% | `r ceiling(power.prop.test(p1 = 0.2*0.1, p2 = 0.2*0.1*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
20\% | 10\% | 10\% | `r ceiling(power.prop.test(p1 = 0.2*0.1, p2 = 0.2*0.1*0.9, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
10\% | 10\% | 75\% | `r ceiling(power.prop.test(p1 = 0.1*0.1, p2 = 0.1*0.1*0.25, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
10\% | 10\% | 50\% | `r ceiling(power.prop.test(p1 = 0.1*0.1, p2 = 0.1*0.1*0.5, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
10\% | 10\% | 30\% | `r ceiling(power.prop.test(p1 = 0.1*0.1, p2 = 0.1*0.1*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
10\% | 10\% | 10\% | `r ceiling(power.prop.test(p1 = 0.1*0.1, p2 = 0.1*0.1*0.9, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`

### Table of sample size for 20\% hospitalisation

Attack rate | Hospitalisation rate | Reduction by vaccination | Sample size per arm
-----------|-----------|--------------|-------------:
80\% | 20\% | 75\% | `r ceiling(power.prop.test(p1 = 0.8*0.2, p2 = 0.8*0.2*0.25, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
80\% | 20\% | 50\% | `r ceiling(power.prop.test(p1 = 0.8*0.2, p2 = 0.8*0.2*0.5, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
80\% | 20\% | 30\% | `r ceiling(power.prop.test(p1 = 0.8*0.2, p2 = 0.8*0.2*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
80\% | 20\% | 10\% | `r ceiling(power.prop.test(p1 = 0.8*0.2, p2 = 0.8*0.2*0.9, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
50\% | 20\% | 75\% | `r ceiling(power.prop.test(p1 = 0.5*0.2, p2 = 0.5*0.2*0.25, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
50\% | 20\% | 50\% | `r ceiling(power.prop.test(p1 = 0.5*0.2, p2 = 0.5*0.2*0.5, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
50\% | 20\% | 30\% | `r ceiling(power.prop.test(p1 = 0.5*0.2, p2 = 0.5*0.2*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
50\% | 20\% | 10\% | `r ceiling(power.prop.test(p1 = 0.5*0.2, p2 = 0.5*0.2*0.9, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 20\% | 75\% | `r ceiling(power.prop.test(p1 = 0.3*0.2, p2 = 0.3*0.2*0.25, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 20\% | 55\% | `r ceiling(power.prop.test(p1 = 0.3*0.2, p2 = 0.3*0.2*0.45, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 20\% | 50\% | `r ceiling(power.prop.test(p1 = 0.3*0.2, p2 = 0.3*0.2*0.5, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 20\% | 40\% | `r ceiling(power.prop.test(p1 = 0.3*0.2, p2 = 0.3*0.2*0.6, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 20\% | 30\% | `r ceiling(power.prop.test(p1 = 0.3*0.2, p2 = 0.3*0.2*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 20\% | 10\% | `r ceiling(power.prop.test(p1 = 0.3*0.2, p2 = 0.3*0.2*0.9, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
20\% | 20\% | 75\% | `r ceiling(power.prop.test(p1 = 0.2*0.2, p2 = 0.2*0.2*0.25, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
20\% | 20\% | 50\% | `r ceiling(power.prop.test(p1 = 0.2*0.2, p2 = 0.2*0.2*0.5, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
20\% | 20\% | 30\% | `r ceiling(power.prop.test(p1 = 0.2*0.2, p2 = 0.2*0.2*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
20\% | 20\% | 10\% | `r ceiling(power.prop.test(p1 = 0.2*0.2, p2 = 0.2*0.2*0.9, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
10\% | 20\% | 75\% | `r ceiling(power.prop.test(p1 = 0.1*0.2, p2 = 0.1*0.2*0.25, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
10\% | 20\% | 50\% | `r ceiling(power.prop.test(p1 = 0.1*0.2, p2 = 0.1*0.2*0.5, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
10\% | 20\% | 30\% | `r ceiling(power.prop.test(p1 = 0.1*0.2, p2 = 0.1*0.2*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
10\% | 20\% | 10\% | `r ceiling(power.prop.test(p1 = 0.1*0.2, p2 = 0.1*0.2*0.9, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`

### Table of sample size for 30\% hospitalisation

Attack rate | Hospitalisation rate | Reduction by vaccination | Sample size per arm
-----------|-----------|--------------|-------------:
80\% | 30\% | 75\% | `r ceiling(power.prop.test(p1 = 0.8*0.3, p2 = 0.8*0.3*0.25, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
80\% | 30\% | 50\% | `r ceiling(power.prop.test(p1 = 0.8*0.3, p2 = 0.8*0.3*0.5, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
80\% | 30\% | 30\% | `r ceiling(power.prop.test(p1 = 0.8*0.3, p2 = 0.8*0.3*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
80\% | 30\% | 10\% | `r ceiling(power.prop.test(p1 = 0.8*0.3, p2 = 0.8*0.3*0.9, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
50\% | 30\% | 75\% | `r ceiling(power.prop.test(p1 = 0.5*0.3, p2 = 0.5*0.3*0.25, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
50\% | 30\% | 50\% | `r ceiling(power.prop.test(p1 = 0.5*0.3, p2 = 0.5*0.3*0.5, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
50\% | 30\% | 30\% | `r ceiling(power.prop.test(p1 = 0.5*0.3, p2 = 0.5*0.3*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
50\% | 30\% | 10\% | `r ceiling(power.prop.test(p1 = 0.5*0.3, p2 = 0.5*0.3*0.9, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 30\% | 75\% | `r ceiling(power.prop.test(p1 = 0.3*0.3, p2 = 0.3*0.3*0.25, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 30\% | 50\% | `r ceiling(power.prop.test(p1 = 0.3*0.3, p2 = 0.3*0.3*0.5, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 30\% | 45\% | `r ceiling(power.prop.test(p1 = 0.3*0.3, p2 = 0.3*0.3*0.55, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 30\% | 33\% | `r ceiling(power.prop.test(p1 = 0.3*0.3, p2 = 0.3*0.3*0.67, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 30\% | 30\% | `r ceiling(power.prop.test(p1 = 0.3*0.3, p2 = 0.3*0.3*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
30\% | 30\% | 10\% | `r ceiling(power.prop.test(p1 = 0.3*0.3, p2 = 0.3*0.3*0.9, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
20\% | 30\% | 75\% | `r ceiling(power.prop.test(p1 = 0.2*0.3, p2 = 0.2*0.3*0.25, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
20\% | 30\% | 50\% | `r ceiling(power.prop.test(p1 = 0.2*0.3, p2 = 0.2*0.3*0.5, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
20\% | 30\% | 30\% | `r ceiling(power.prop.test(p1 = 0.2*0.3, p2 = 0.2*0.3*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
20\% | 30\% | 10\% | `r ceiling(power.prop.test(p1 = 0.2*0.3, p2 = 0.2*0.3*0.9, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
10\% | 30\% | 75\% | `r ceiling(power.prop.test(p1 = 0.1*0.3, p2 = 0.1*0.3*0.25, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
10\% | 30\% | 50\% | `r ceiling(power.prop.test(p1 = 0.1*0.3, p2 = 0.1*0.3*0.5, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
10\% | 30\% | 30\% | `r ceiling(power.prop.test(p1 = 0.1*0.3, p2 = 0.1*0.3*0.7, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`
10\% | 30\% | 10\% | `r ceiling(power.prop.test(p1 = 0.1*0.3, p2 = 0.1*0.3*0.9, sig.level = 0.05, power = 0.8, n = NULL, alternative = 'one.sided')$n)`

\newpage

# Appendix B: Additional summary tables

### Missing variables

In the datasets, **99.9** was default for missing numerical decimal fields, and **999** was default for missing numerical integer fields. 
The datasets were checked for these values at input. If necessary, values were replaced appropriately with justification.

`r if (nrow(df_missing_data_dec) == 0) {'No missing numerical decimal fields.'} else {kable(df_missing_data_dec)}`
`r if (nrow(df_missing_data_int) == 0) {'No missing numerical integer fields.'} else {kable(df_missing_data_int)}`

### Numerical demographics

Missing values (see [above][Missing variables]) are reported as NA in the `r ifelse(CLOSED == T, 'tables', 'table')` below.

`r ifelse(CLOSED == T, paste("**the ", Arm1, " arm**", sep = ''),  "")`
```{r numerical demographics table group1, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Height and weight use 999 as the missing-value code. Blank those entries and
# the BMI on the same rows; BMI is handled first so that it still sees the
# original sentinel values.
df_demographics_numerical <- df_demographics_numerical %>%
  mutate(
    BMI = replace(BMI, height == 999 | weight == 999, NA),
    height = replace(height, height == 999, NA),
    weight = replace(weight, weight == 999, NA)
  )

# Summary table for the rows whose group equals Arm1, without the group and
# site columns.
df_demographics_numerical %>%
  filter(group == Arm1) %>%
  select(-group, -site) %>%
  summary_table() %>%
  kable()
```

`r ifelse(CLOSED == T, paste("**the ", Arm2, " arm**", sep = ''),  "")`
```{r numerical demographics table group2, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Summary table of the numerical demographics, restricted to the second arm;
# the arm and site identifiers are dropped before summarising
df_demographics_numerical %>%
  filter(group == Arm2) %>%
  select(-c(group, site)) %>%
  summary_table() %>%
  kable()
```

```{r numerical demographics table, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Recode the missing-value sentinel (999) of height and weight to NA.
# BMI is blanked first, while the sentinel is still present in the source columns.
is.na(df_demographics_numerical$BMI) <- which(df_demographics_numerical$height == 999)
is.na(df_demographics_numerical$BMI) <- which(df_demographics_numerical$weight == 999)
is.na(df_demographics_numerical$height) <- which(df_demographics_numerical$height == 999)
is.na(df_demographics_numerical$weight) <- which(df_demographics_numerical$weight == 999)

# Summary table of the numerical demographics over all participants
# (open-session variant: no split by arm)
df_demographics_numerical %>%
  select(-c(group, site)) %>%
  summary_table() %>%
  kable()
```

### Binary demographics

Empty records are assigned either '' (blank) or NA.

`r ifelse(CLOSED == T, paste("**the ", Arm1, " arm**", sep = ''),  "")`
```{r binary demographics table group1, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Summary of the binary demographics for the first arm.
# Blank strings are recoded to NA before summarising. Base replace() is used
# because na_if() no longer accepts a whole data frame as of dplyr 1.1.0;
# replace(x, x == '', NA) is exactly what the older na_if() did internally.
df_demographics_binary %>%
  filter(group == Arm1) %>%
  replace(., . == '', NA) %>%
  select(!c(group, site)) %>%
  summary() %>%
  kable()
```

`r ifelse(CLOSED == T, paste("**the ", Arm2, " arm**", sep = ''),  "")`
```{r binary demographics table group2, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Summary of the binary demographics for the second arm.
# Blank strings are recoded to NA before summarising. Base replace() is used
# because na_if() no longer accepts a whole data frame as of dplyr 1.1.0;
# replace(x, x == '', NA) is exactly what the older na_if() did internally.
df_demographics_binary %>%
  filter(group == Arm2) %>%
  replace(., . == '', NA) %>%
  select(!c(group, site)) %>%
  summary() %>%
  kable()
```

```{r binary demographics table, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Summary of the binary demographics over all participants (no split by arm).
# Blank strings are recoded to NA before summarising. Base replace() is used
# because na_if() no longer accepts a whole data frame as of dplyr 1.1.0;
# replace(x, x == '', NA) is exactly what the older na_if() did internally.
df_demographics_binary %>%
  replace(., . == '', NA) %>%
  select(!c(group, site)) %>%
  summary() %>%
  kable()
```

### Categorical demographics

`r ifelse(CLOSED == T, paste("**the ", Arm1, " arm**", sep = ''),  "")`
```{r categorical demographics table group1, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Summary of the categorical demographics for the first arm.
# The two free-text comment columns and the arm/site identifiers are left out.
df_demographics_categorical %>%
  filter(group == Arm1) %>%
  select(-c(ethnicity.other_comment, country_birth.other_comment, group, site)) %>%
  summary() %>%
  kable()
```

`r ifelse(CLOSED == T, paste("**the ", Arm2, " arm**", sep = ''),  "")`
```{r categorical demographics table group2, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Summary of the categorical demographics for the second arm.
# The two free-text comment columns and the arm/site identifiers are left out.
df_demographics_categorical %>%
  filter(group == Arm2) %>%
  select(-c(ethnicity.other_comment, country_birth.other_comment, group, site)) %>%
  summary() %>%
  kable()
```

```{r categorical demographics table, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Summary of the categorical demographics over all participants (no split by arm).
# The two free-text comment columns and the arm/site identifiers are left out.
df_demographics_categorical %>%
  select(-c(ethnicity.other_comment, country_birth.other_comment, group, site)) %>%
  summary() %>%
  kable()
```

\newpage

### Work related demographics, including expected exposure to COVID-19

`r ifelse(CLOSED == T, paste("**the ", Arm1, " arm**", sep = ''),  "")`
```{r work demographics table group1, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Summary of the work-related demographics for the first arm
df_work_categorical %>%
  filter(group == Arm1) %>%
  select(-c(group, site)) %>%
  summary() %>%
  kable() %>%
  # scale down to fit the page width and pin the table at this position
  kable_styling(latex_options = c('scale_down', 'HOLD_position'))
```

`r ifelse(CLOSED == T, paste("**the ", Arm2, " arm**", sep = ''),  "")`
```{r work demographics table group2, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Summary of the work-related demographics for the second arm
df_work_categorical %>%
  filter(group == Arm2) %>%
  select(-c(group, site)) %>%
  summary() %>%
  kable() %>%
  # scale down to fit the page width and pin the table at this position
  kable_styling(latex_options = c('scale_down', 'HOLD_position'))
```

```{r work demographics table, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Summary of the work-related demographics over all participants (no split by arm)
df_work_categorical %>%
  select(-c(group, site)) %>%
  summary() %>%
  kable() %>%
  # scale down to fit the page width and pin the table at this position
  kable_styling(latex_options = c('scale_down', 'HOLD_position'))
```

*Legend work_hours: 1_40 means 1-40 hours/week, 41_80 means 41-80 hours/week, more_than_80 means more than 80 hours/week.*

\newpage 

### Respiratory tract infection events (full details)

```{r events RTI details, eval = eval_dsmb, echo = echo_dsmb, results = 'asis'}

# Listing of respiratory tract infection events that are not serious adverse
# events (SAEs are listed under their own header)
df2_event_rti_comb %>%
  filter(event_sae == 'No') %>%
  select(
    no, report_date, event_start, aware_date, event_hs,
    event_causality, event_status, event_ongoing, event_stop
  ) %>%
  kable() %>%
  kable_styling(latex_options = c('scale_down', 'HOLD_position'))

```  


\newpage 

### Injection site reaction events (full details)

```{r events ISR details, eval = eval_dsmb, echo = echo_dsmb}
# Listing of injection site reaction events; records without a report date
# (blank string) are left out
df2_event_isr %>%
  filter(report_date_1 != '') %>%
  select(
    no, report_date_1, event_start_1, event_hs_1,
    event_causality_1, event_status_1, event_ongoing_1, event_stop_1
  ) %>%
  kable() %>%
  kable_styling(latex_options = c('scale_down', 'HOLD_position'))
```

### Other events (full details)

```{r events other details 1, eval = eval_dsmb, echo = echo_dsmb}
# Listing of other events that are not serious adverse events
# (SAEs are listed under their own header)
df2_event_other %>%
  filter(event_sae_2 == 'No') %>%
  select(
    no, report_date_2, event_start_2, aware_date_2, event_hs_2,
    event_causality_2, event_status_2, event_ongoing_2, event_stop_2
  ) %>%
  kable() %>%
  kable_styling(latex_options = c('scale_down', 'HOLD_position'))
```

\newpage 

### Serious adverse events (full details)

|||
|---|---------|
| **Date of start of SAE** | `r df_sae$event_start[1]` | 
|||
| **Date of site awareness of SAE** | `r df_sae$aware_date[1]` | 
|||
| **Date of report of SAE** | `r df_sae$report_date[1]` | 
|||
| **Event name** | `r df_sae$event_name[1]` | 
|||
| **Seriousness criteria** | `r df_sae$sae_criteria[1]` | 
|||
| **Description** | `r tolower(df_sae$event_description[1])` | 
|||
| **Treatment** | `r tolower(df_sae$event_treatment[1])` | 
|||
| **Relevant investigations / laboratory results** | `r tolower(df_sae$event_investigation[1])` | 
|||
| **Event health status score** | `r df_sae$event_HS[1]` | 
|||
| **Causality relationship to study IP** | `r df_sae$event_causality[1]` | 
|||
| **Event status** | `r df_sae$event_status[1]`, `r ifelse(df_sae$event_ongoing[1] == 'Yes', 'Event ongoing', paste('Event stopped at ', format(as.Date(df_sae$event_stop[1], format = '%Y-%m-%d'), format = '%A, %d %B %Y'), sep = ''))` | 
||| 
| `r ifelse(is.na(df_sae$death_cause[1]) | df_sae$death_cause[1] == '', '', '**Cause of death**')` | `r ifelse(is.na(df_sae$death_cause[1]), '', as.character(df_sae$death_cause[1]))` |  

\newpage

# Appendix C: Additional figures

### Demographics as bar graphs (number of participants)

The following figures display the baseline demographics of the participants`r ifelse(CLOSED == T, " stratified by study arm", "")`.

### Age (bar graph)
```{r demographics age bar, eval = eval_dsmb, echo = echo_dsmb, fig.height = 3}
# Bar chart of the number of participants per age, one facet per value of `group`.
ggplot(df_demographics_numerical, aes(age, fill = factor(group))) +
  geom_bar() +
  scale_x_continuous(name = 'Age (year)') +
  scale_y_continuous(name = 'Number of participants') +
  # guide = 'none' hides the fill legend; guide = FALSE is deprecated since ggplot2 3.3.4
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = 'none', option = 'cividis') +
  # the title mentions the stratification only in the closed report
  ggtitle(paste0('Distribution of age in the trial population', if (CLOSED) ' stratified per arm' else '')) +
  theme_bw() +
  # facet_grid(~group, labeller = labeller(group = c('Both'='Total trial population', '1' = '1', '2' = '2')))
  facet_grid(~group)

```

### Baseline body weight (bar graph)
```{r demographics weight bar, eval = eval_dsmb, echo = echo_dsmb, fig.height = 3}
# Bar chart of the number of participants per baseline body weight (rounded to
# whole kg), one facet per value of `group`.
ggplot(df_demographics_numerical, aes(round(weight), fill = factor(group))) +
  geom_bar() +
  scale_x_continuous(name = 'Baseline body weight (kg)') +
  scale_y_continuous(name = 'Number of participants') +
  # guide = 'none' hides the fill legend; guide = FALSE is deprecated since ggplot2 3.3.4
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = 'none', option = 'cividis') +
  # the title mentions the stratification only in the closed report
  ggtitle(paste0('Distribution of baseline body weight in the trial population', if (CLOSED) ' stratified per arm' else '')) +
  theme_bw() +
  # facet_grid(~group, labeller = labeller(group = c('Both'='Total trial population', '1' = '1', '2' = '2')))
  facet_grid(~group)

```

### Height (bar graph)
```{r demographics height bar, eval = eval_dsmb, echo = echo_dsmb, fig.height = 3}
# Bar chart of the number of participants per height, one facet per value of `group`.
# Only heights above 75 that are not the missing-value sentinel (999) are plotted.
# filter() is used instead of base logical subsetting: with NA heights, base
# subsetting returns all-NA rows, which would add a spurious 'NA' facet.
df_demographics_numerical %>%
  filter(height > 75, height != 999) %>%
  ggplot(aes(height, fill = factor(group))) +
  geom_bar() +
  scale_x_continuous(name = 'Height (cm)') +
  scale_y_continuous(name = 'Number of participants') +
  # guide = 'none' hides the fill legend; guide = FALSE is deprecated since ggplot2 3.3.4
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = 'none', option = 'cividis') +
  # the title mentions the stratification only in the closed report
  ggtitle(paste0('Distribution of height in the trial population', if (CLOSED) ' stratified per arm' else '')) +
  theme_bw() +
  # facet_grid(~group, labeller = labeller(group = c('Both'='Total trial population', '1' = '1', '2' = '2')))
  facet_grid(~group)

```

### BMI (bar graph)
```{r demographics BMI bar, eval = eval_dsmb, echo = echo_dsmb, fig.height = 3}
# Bar chart of the number of participants per BMI value, one facet per value of
# `group`. Records whose height or weight is missing (sentinel 999 or NA) are
# left out.
df_demographics_numerical %>%
  filter(height != 999) %>%
  filter(weight != 999) %>%
  ggplot(aes(BMI, fill = factor(group))) +
  geom_bar() +
  # geom_density(aes(weight/(height/100 * height/100)), col = 'red') + #checking calculation
  # dotted reference lines at the BMI category boundaries
  geom_vline(xintercept = c(18.5, 25, 30, 40), linetype = 'dotted') +
  scale_x_continuous(name = expression(paste('BMI (kg/', m^2, ')', sep = ''))) +
  scale_y_continuous(name = 'Number of participants') +
  # guide = 'none' hides the fill legend; guide = FALSE is deprecated since ggplot2 3.3.4
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = 'none', option = 'cividis') +
  # the title mentions the stratification only in the closed report
  ggtitle(paste0('Distribution of BMI in the trial population', if (CLOSED) ' stratified per arm' else '')) +
  theme_bw() +
  # facet_grid(~group, labeller = labeller(group = c('Both'='Total trial population', '1' = '1', '2' = '2')))
  facet_grid(~group)

```

*Dotted lines delimit the categories underweight (<18.5), normal (18.5-24.9), overweight (25-29.9), obese (30-39.9), and extremely obese ($\geq$40).*

### Current smoking (bar graph)
```{r demographics smoking bar, eval = eval_dsmb, echo = echo_dsmb, fig.height = 3}
# Bar chart of the number of participants per pack-years value, one facet per
# value of `group`.
ggplot(df_risk, aes(pack_years, fill = factor(group))) +
  geom_bar() +
  scale_x_continuous(name = 'Pack years') +
  scale_y_continuous(name = 'Number of participants') +
  # guide = 'none' hides the fill legend; guide = FALSE is deprecated since ggplot2 3.3.4
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = 'Arm', guide = 'none', option = 'cividis') +
  # the title mentions the stratification only in the closed report
  ggtitle(paste0('Distribution of current smoking in the trial population', if (CLOSED) ' stratified per arm' else '')) +
  theme_bw() +
  # facet_grid(~group, labeller = labeller(group = c('Both'='Total trial population', '1' = '1', '2' = '2')))
  facet_grid(~group)

```


\newpage

# Appendix D: Additional data listings

The following variables are recorded in the different databases for each participant.

### Screening and enrolment

`r names(df1_full)`

### Events

`r names(df2_full)`

### Laboratory results

`r names(df3_full)`

### Follow-up

`r names(df4_full)`

\newpage

# Appendix E: Hospitalization by risk factor/classifier

The following figures show Kaplan-Meier estimates for the first hospitalization event, stratified by the identified risk factors/classifiers.

\newpage

### Risk factor: age

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor age. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor age, versus time. The y-axis is truncated (it does not start at zero) for clearer visualization.

```{r KM plot hospitalization age plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Assign every participant to an age band; the lower bound of each band is
# inclusive (below 30, 30 to below 50, 50 to below 70, 70 and above)
hosp_km_censor_novac <- hosp_km_censor_novac %>%
  mutate(age_group = c('18-30', '30-50', '50-70', '70+')[findInterval(age, c(30, 50, 70)) + 1])

# Kaplan-Meier fit per arm and age band; time is divided by 7 to match the
# weekly x-axis
fit2 <- survfit(Surv(time / 7, status) ~ group + age_group, data = hosp_km_censor_novac)

km_age <- ggsurvplot(
  fit2,
  data = hosp_km_censor_novac,
  title = 'Kaplan-Meier plot for first hospitalization\nstratified on age, enhanced y-axis',
  xlab = 'Time since vaccination (weeks)',
  ylab = 'Total proportion not hospitalized',
  ylim = c(0.11, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(8, end = 0.75, option = 'viridis'),
  ggtheme = theme_bw(),
  legend = 'left',
  legend.title = '',
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Draw only the survival curves here, one facet per age band
km_age[['plot']] +
  facet_grid(. ~ age_group) +
  theme(axis.text = element_text(size = 7), legend.text = element_text(size = 7))

```

```{r KM plot hospitalization age table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table belonging to the age-stratified Kaplan-Meier plot
km_age[['table']]
```

\newpage

### Risk factor: gender

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor gender. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor gender, versus time. The y-axis is truncated (it does not start at zero) for clearer visualization.

```{r KM plot hospitalization gender plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit per arm and gender; time is divided by 7 to match the
# weekly x-axis
fit2 <- survfit(Surv(time / 7, status) ~ group + gender, data = hosp_km_censor_novac)

km_gender <- ggsurvplot(
  fit2,
  data = hosp_km_censor_novac,
  title = 'Kaplan-Meier plot for first hospitalization\nstratified on gender, enhanced y-axis',
  xlab = 'Time since vaccination (weeks)',
  ylab = 'Total proportion not hospitalized',
  ylim = c(0.28, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(4, end = 0.75, option = 'viridis'),
  ggtheme = theme_bw(),
  # legend.labs = c('the ", Arm1, " arm\nfemale', 'the ", Arm1, " arm\nmale', 'the ", Arm2, " arm\nfemale', 'the ", Arm2, " arm\nmale'),
  legend = 'left',
  legend.title = '',
  # legend.title = 'Arm, gender',
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Draw only the survival curves here, one facet per gender
km_gender[['plot']] +
  facet_grid(. ~ gender) +
  theme(axis.text = element_text(size = 7), legend.text = element_text(size = 7))

```

```{r KM plot hospitalization gender table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table belonging to the gender-stratified Kaplan-Meier plot
km_gender[['table']]
```

\newpage

### Risk factor: Body mass index (BMI)

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor BMI. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor BMI, versus time. The y-axis is truncated (it does not start at zero) for clearer visualization.

```{r KM plot hospitalization bmi plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}
# Assign every participant to a BMI category. The category boundaries are
# 18.5, 25, 30 and 40 with inclusive lower bounds, so a value between two
# printed labels (e.g. 24.95) stays in the lower category instead of being
# pushed into the next one, as happened with the former `<= 24.9` style cut-offs.
hosp_km_censor_novac <- hosp_km_censor_novac %>%
  mutate(BMI_group = c('< 18.5', '18.5-24.9', '25-29.9', '30-39.9', '40+')[findInterval(BMI, c(18.5, 25, 30, 40)) + 1])

# Kaplan-Meier fit per arm and BMI category; time is divided by 7 to match the
# weekly x-axis
fit2 <- survfit(Surv(time / 7, status) ~ group + BMI_group, data = hosp_km_censor_novac)

km_bmi <- ggsurvplot(
  fit2,
  data = hosp_km_censor_novac,
  title = 'Kaplan-Meier plot for first hospitalization\nstratified on BMI, enhanced y-axis',
  xlab = 'Time since vaccination (weeks)',
  ylab = 'Total proportion not hospitalized',
  ylim = c(0.41, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(10, end = 0.75, option = 'viridis'),
  ggtheme = theme_bw(),
  legend = 'left',
  legend.title = '',
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Draw only the survival curves here, one facet per BMI category
km_bmi$plot +
  facet_grid(~BMI_group) +
  theme(axis.text = element_text(size = 7), legend.text = element_text(size = 7))

```

```{r KM plot hospitalization bmi table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table belonging to the BMI-stratified Kaplan-Meier plot
km_bmi[['table']]
```

\newpage

### Classifier: ethnicity

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` classifier ethnicity. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized, stratified by`r ifelse(CLOSED == T, " study arm and", "")` classifier ethnicity, versus time. The y-axis is not truncated in this figure.

```{r KM plot hospitalization ethnicity plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}
# Kaplan-Meier fit per arm and ethnicity; time is divided by 7 to match the
# weekly x-axis
fit2 <- survfit(Surv(time / 7, status) ~ group + ethnicity, data = hosp_km_censor_novac)

km_ethnicity <- ggsurvplot(
  fit2,
  data = hosp_km_censor_novac,
  title = 'Kaplan-Meier plot for first hospitalization\nstratified on ethnicity',
  xlab = 'Time since vaccination (weeks)',
  ylab = 'Total proportion not hospitalized',
  # ylim = c(0.8, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(10, end = 0.75, option = 'viridis'),
  ggtheme = theme_bw(),
  legend = 'left',
  legend.title = '',
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Draw only the survival curves here, one facet per ethnicity
km_ethnicity[['plot']] +
  facet_grid(. ~ ethnicity) +
  theme(axis.text = element_text(size = 7), legend.text = element_text(size = 7))

```

```{r KM plot hospitalization ethnicity table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table belonging to the ethnicity-stratified Kaplan-Meier plot
km_ethnicity[['table']]
```

\newpage

### Risk factor: job category

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor job category. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor job category, versus time. The y-axis is truncated (it does not start at zero) for clearer visualization.

```{r KM plot hospitalization job_category plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit per arm and job category; time is divided by 7 to match the
# weekly x-axis
fit2 <- survfit(Surv(time / 7, status) ~ group + job_category, data = hosp_km_censor_novac)

# NOTE(review): the facet labeller below names five job categories; with two
# arms that could be 10 strata against 8 palette colours - confirm against the data
km_job_category <- ggsurvplot(
  fit2,
  data = hosp_km_censor_novac,
  title = 'Kaplan-Meier plot for first hospitalization\nstratified on job category, enhanced y-axis',
  xlab = 'Time since vaccination (weeks)',
  ylab = 'Total proportion not hospitalized',
  ylim = c(0.6, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(8, end = 0.75, option = 'viridis'),
  ggtheme = theme_bw(),
  legend = 'left',
  legend.title = '',
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Draw only the survival curves here, one facet per job category; the labeller
# breaks the longer category names over two lines
km_job_category[['plot']] +
  facet_grid(
    . ~ job_category,
    labeller = labeller(job_category = c('Doctor' = 'Doctor', 'Nurse' = 'Nurse', 'Essential_workers' = 'Essential\nworkers', 'Support_staff' = 'Support\nstaff', 'Frontline_workers' = 'Frontline\nworkers'))
  ) +
  theme(axis.text = element_text(size = 7), legend.text = element_text(size = 7))

```

```{r KM plot hospitalization job_category table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table belonging to the job-category-stratified Kaplan-Meier plot
km_job_category[['table']]
```

\newpage

### Risk factor: medical history of diabetes mellitus

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor diabetes mellitus. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor diabetes mellitus, versus time. The y-axis is truncated (it does not start at zero) for clearer visualization.

```{r KM plot hospitalization medhis_dm plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit per arm and diabetes mellitus history; time is divided by 7
# to match the weekly x-axis
fit2 <- survfit(Surv(time / 7, status) ~ group + medhis_dm, data = hosp_km_censor_novac)

km_medhis_dm <- ggsurvplot(
  fit2,
  data = hosp_km_censor_novac,
  title = 'Kaplan-Meier plot for first hospitalization\nstratified on diabetes mellitus, enhanced y-axis',
  xlab = 'Time since vaccination (weeks)',
  ylab = 'Total proportion not hospitalized',
  ylim = c(0.55, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(4, end = 0.75, option = 'viridis'),
  ggtheme = theme_bw(),
  legend = 'left',
  legend.title = '',
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Draw only the survival curves here, one facet per value of medhis_dm
km_medhis_dm[['plot']] +
  facet_grid(. ~ medhis_dm) +
  theme(axis.text = element_text(size = 7), legend.text = element_text(size = 7))

```

```{r KM plot hospitalization medhis_dm table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table belonging to the diabetes-stratified Kaplan-Meier plot
km_medhis_dm[['table']]
```

\newpage

### Risk factor: medical history of hypertension

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor hypertension. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor hypertension, versus time. The y-axis is truncated (it does not start at zero) for clearer visualization.

```{r KM plot hospitalization medhis_hyptens plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit per arm and hypertension history; time is divided by 7 to
# match the weekly x-axis
fit2 <- survfit(Surv(time / 7, status) ~ group + medhis_hyptens, data = hosp_km_censor_novac)

km_medhis_hyptens <- ggsurvplot(
  fit2,
  data = hosp_km_censor_novac,
  title = 'Kaplan-Meier plot for first hospitalization\nstratified on hypertension, enhanced y-axis',
  xlab = 'Time since vaccination (weeks)',
  ylab = 'Total proportion not hospitalized',
  ylim = c(0.6, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(4, end = 0.75, option = 'viridis'),
  ggtheme = theme_bw(),
  legend = 'left',
  legend.title = '',
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Draw only the survival curves here, one facet per value of medhis_hyptens
km_medhis_hyptens[['plot']] +
  facet_grid(. ~ medhis_hyptens) +
  theme(axis.text = element_text(size = 7), legend.text = element_text(size = 7))

```

```{r KM plot hospitalization medhis_hyptens table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table belonging to the hypertension-stratified Kaplan-Meier plot
km_medhis_hyptens[['table']]
```

\newpage

### Risk factor: medical history of cardiovascular disease

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor cardiovascular disease. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor cardiovascular disease, versus time. The y-axis is truncated (it does not start at zero) for clearer visualization.

```{r KM plot hospitalization medhis_cvd plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit per arm and cardiovascular disease history; time is divided
# by 7 to match the weekly x-axis
fit2 <- survfit(Surv(time / 7, status) ~ group + medhis_cvd, data = hosp_km_censor_novac)

km_medhis_cvd <- ggsurvplot(
  fit2,
  data = hosp_km_censor_novac,
  title = 'Kaplan-Meier plot for first hospitalization\nstratified on cardiovascular disease, enhanced y-axis',
  xlab = 'Time since vaccination (weeks)',
  ylab = 'Total proportion not hospitalized',
  ylim = c(0.6, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(4, end = 0.75, option = 'viridis'),
  ggtheme = theme_bw(),
  legend = 'left',
  legend.title = '',
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Draw only the survival curves here, one facet per value of medhis_cvd
km_medhis_cvd[['plot']] +
  facet_grid(. ~ medhis_cvd) +
  theme(axis.text = element_text(size = 7), legend.text = element_text(size = 7))

```

```{r KM plot hospitalization medhis_cvd table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table belonging to the cardiovascular-disease-stratified Kaplan-Meier plot
km_medhis_cvd[['table']]
```

\newpage

### Risk factor: medical history of asthma

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor asthma. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor asthma, versus time. The y-axis is truncated (it does not start at zero) for clearer visualization.

```{r KM plot hospitalization medhis_asthma plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit per arm and asthma history; time is divided by 7 to match
# the weekly x-axis
fit2 <- survfit(Surv(time / 7, status) ~ group + medhis_asthma, data = hosp_km_censor_novac)

km_medhis_asthma <- ggsurvplot(
  fit2,
  data = hosp_km_censor_novac,
  title = 'Kaplan-Meier plot for first hospitalization\nstratified on asthma, enhanced y-axis',
  xlab = 'Time since vaccination (weeks)',
  ylab = 'Total proportion not hospitalized',
  ylim = c(0.6, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(4, end = 0.75, option = 'viridis'),
  ggtheme = theme_bw(),
  legend = 'left',
  legend.title = '',
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Draw only the survival curves here, one facet per value of medhis_asthma
km_medhis_asthma[['plot']] +
  facet_grid(. ~ medhis_asthma) +
  theme(axis.text = element_text(size = 7), legend.text = element_text(size = 7))

```

```{r KM plot hospitalization medhis_asthma table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table belonging to the asthma-stratified Kaplan-Meier plot
km_medhis_asthma[['table']]
```

\newpage

### Risk factor: medical history of COPD

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor COPD. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor COPD, versus time. The y-axis is truncated (it does not start at zero) for clearer visualization.

```{r KM plot hospitalization medhis_copd plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit per arm and COPD history; time is divided by 7 to match the
# weekly x-axis
fit2 <- survfit(Surv(time / 7, status) ~ group + medhis_copd, data = hosp_km_censor_novac)

km_medhis_copd <- ggsurvplot(
  fit2,
  data = hosp_km_censor_novac,
  title = 'Kaplan-Meier plot for first hospitalization\nstratified on COPD, enhanced y-axis',
  xlab = 'Time since vaccination (weeks)',
  ylab = 'Total proportion not hospitalized',
  ylim = c(0.29, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(4, end = 0.75, option = 'viridis'),
  ggtheme = theme_bw(),
  legend = 'left',
  legend.title = '',
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Draw only the survival curves here, one facet per value of medhis_copd
km_medhis_copd[['plot']] +
  facet_grid(. ~ medhis_copd) +
  theme(axis.text = element_text(size = 7), legend.text = element_text(size = 7))

```

```{r KM plot hospitalization medhis_copd table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table belonging to the COPD-stratified Kaplan-Meier plot
km_medhis_copd[['table']]
```

\newpage

### Risk factor: medical history of other lung disease

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor other lung disease. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor other lung disease, versus time. The y-axis is truncated (it does not start at zero) for clearer visualization.

```{r KM plot hospitalization medhis_otherlung plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for first hospitalization, stratified by `group` (study arm)
# and history of other lung disease. `time` is divided by 7 to match the weekly
# x-axis.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + medhis_otherlung,
  data = hosp_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a number-at-risk
# table (risk.table.height is the height of the risk table). The object is
# stored because the risk table is printed separately from the curves.
km_medhis_otherlung <- ggsurvplot(
  fit2,
  data = hosp_km_censor_novac,
  title = "Kaplan-Meier plot for first hospitalization\nstratified on other lung disease, enhanced y-axis",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion not hospitalized",
  ylim = c(0.59, 1),
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = viridis(4, end = 0.75, option = "viridis"),
  legend = "left",
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.25,
  risk.table.fontsize = 3
)

# One facet per other-lung-disease category, with smaller axis and legend text.
km_medhis_otherlung$plot +
  facet_grid(~medhis_otherlung) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot hospitalization medhis_otherlung table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Print the number-at-risk table stored in the other-lung-disease ggsurvplot object.
km_medhis_otherlung[["table"]]
```

\newpage

### Risk factor: medical history of kidney disease

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor kidney disease. The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor kidney disease, versus time. The y-axis is enhanced for clearer visualization.

```{r KM plot hospitalization medhis_kd plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for first hospitalization, stratified by `group` (study arm)
# and kidney disease history. `time` is divided by 7 to match the weekly x-axis.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + medhis_kd,
  data = hosp_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a number-at-risk
# table (risk.table.height is the height of the risk table). The object is
# stored because the risk table is printed separately from the curves.
km_medhis_kd <- ggsurvplot(
  fit2,
  data = hosp_km_censor_novac,
  title = "Kaplan-Meier plot for first hospitalization\nstratified on kidney disease, enhanced y-axis",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion not hospitalized",
  ylim = c(0.59, 1),
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = viridis(4, end = 0.75, option = "viridis"),
  legend = "left",
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.25,
  risk.table.fontsize = 3
)

# Smaller axis and legend text, then one facet per kidney-disease category.
km_medhis_kd$plot +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  ) +
  facet_grid(~medhis_kd)

```

```{r KM plot hospitalization medhis_kd table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Print the number-at-risk table stored in the kidney-disease ggsurvplot object.
km_medhis_kd[["table"]]
```

\newpage

### Risk factor: presence of BCG scar

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor BCG scar. The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor BCG scar, versus time. The y-axis is enhanced for clearer visualization.

```{r KM plot hospitalization bcg_scar plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for first hospitalization, stratified by `group` (study arm)
# and presence of a BCG scar. `time` is divided by 7 to match the weekly x-axis.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + bcg_scar,
  data = hosp_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a number-at-risk
# table (risk.table.height is the height of the risk table). The object is
# stored because the risk table is printed separately from the curves.
km_bcg_scar <- ggsurvplot(
  fit2,
  data = hosp_km_censor_novac,
  title = "Kaplan-Meier plot for first hospitalization\nstratified on BCG scar, enhanced y-axis",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion not hospitalized",
  ylim = c(0.54, 1),
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = viridis(4, end = 0.75, option = "viridis"),
  legend = "left",
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.25,
  risk.table.fontsize = 3
)

# One facet per BCG-scar category, with smaller axis and legend text.
km_bcg_scar$plot +
  facet_grid(~bcg_scar) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot hospitalization bcg_scar table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Print the number-at-risk table stored in the BCG-scar ggsurvplot object.
km_bcg_scar[["table"]]
```

\newpage

### Risk factor: currently smoking

Kaplan-Meier estimates of the proportion of subjects who were not hospitalized versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor currently smoking. The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of being hospitalized, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor currently smoking, versus time. The y-axis is enhanced for clearer visualization.

Currently smoking is grouped in 0, 1-20, 21-40, and >40 pack years.

```{r KM plot hospitalization pack_years plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}
# Bin pack years into the smoking categories used for stratification.
# Subjects with a missing pack_years value are assigned to the '0' category.
# Every branch returns a character label, so smoke_group always has a single,
# consistent type ('40<' denotes more than 40 pack years). The previous version
# mixed a numeric 0 with character labels and carried an unreachable fallback.
hosp_km_censor_novac <- hosp_km_censor_novac %>%
  mutate(
    smoke_group = ifelse(
      is.na(pack_years) | pack_years == 0, '0',
      ifelse(
        pack_years <= 20, '1-20',
        ifelse(pack_years <= 40, '21-40', '40<')
      )
    )
  )

# Kaplan-Meier fit for first hospitalization, stratified by `group` (study arm)
# and smoking category. `time` is divided by 7 to match the weekly x-axis.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + smoke_group,
  data = hosp_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a number-at-risk
# table (risk.table.height is the height of the risk table). The object is
# stored because the risk table is printed separately from the curves.
# NOTE(review): the palette supplies 6 colours; if every smoking category is
# populated in more than one arm there may be more than 6 strata - confirm
# against the data before reuse.
km_pack_years <- ggsurvplot(
  fit2,
  data = hosp_km_censor_novac,
  title = 'Kaplan-Meier plot for first hospitalization\nstratified on currently smoking, enhanced y-axis',
  xlab = 'Time since vaccination (weeks)',
  ylab = 'Total proportion not hospitalized',
  ylim = c(0.42, 1),
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = viridis(6, end = 0.75, option = 'viridis'),
  legend = 'left',
  legend.title = '',
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.25,
  risk.table.fontsize = 3
)

# One facet per smoking category, with smaller axis and legend text.
km_pack_years$plot +
  facet_grid(~smoke_group) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot hospitalization pack_years table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Print the number-at-risk table stored in the smoking-stratified ggsurvplot object.
km_pack_years[["table"]]
```

\newpage

# Appendix F: KM plots for respiratory tract infections 

Kaplan-Meier plots for all health status scores for the first RTI event.

### Health status 1: mild symptoms (RTI events)

Kaplan-Meier estimates of the proportion of subjects who did not have an RTI event (HS=1) versus time. Only the first event is shown. `r ifelse(CLOSED == T, " The figure is stratified by study arm.", "")` The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of an RTI event`r ifelse(CLOSED == T, ", stratified by study arm", "")`, versus time. The y-axis is enhanced for clearer visualization.

```{r KM plot HS1, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event with health score 1, stratified by
# `group` (study arm). `time` is divided by 7 to match the weekly x-axis.
fit <- survfit(Surv(time / 7, status) ~ group, data = hs1_km_censor_novac)

# Two-colour cividis palette, chosen to match the other figures.
col_vir <- viridis(n = 2, end = 0.75, option = "cividis")

# Curves with confidence bands, censoring marks and a number-at-risk table.
# The plot is not stored; it is printed as the value of this expression.
ggsurvplot(
  fit,
  data = hs1_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI with mild symptoms\n(health score = 1, enhanced y-axis)",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  ylim = c(0.5, 1),
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = col_vir,
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE
)

```

### Health status 2: moderate symptoms (RTI events)

Kaplan-Meier estimates of the proportion of subjects who did not have an RTI event (HS=2) versus time. Only the first event is shown. `r ifelse(CLOSED == T, " The figure is stratified by study arm.", "")` The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of an RTI event`r ifelse(CLOSED == T, ", stratified by study arm", "")`, versus time. The y-axis is enhanced for clearer visualization.

```{r KM plot HS2, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event with health score 2, stratified by
# `group` (study arm). `time` is divided by 7 to match the weekly x-axis.
fit <- survfit(Surv(time / 7, status) ~ group, data = hs2_km_censor_novac)

# Two-colour cividis palette, chosen to match the other figures.
col_vir <- viridis(n = 2, end = 0.75, option = "cividis")

# Curves with confidence bands, censoring marks and a number-at-risk table.
# The plot is not stored; it is printed as the value of this expression.
ggsurvplot(
  fit,
  data = hs2_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI with moderate symptoms\n(health score = 2, enhanced y-axis)",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  ylim = c(0.6, 1),
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = col_vir,
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE
)


```

### Health status 3: severe symptoms (RTI events)

Kaplan-Meier estimates of the proportion of subjects who did not have an RTI event (HS=3) versus time. Only the first event is shown. `r ifelse(CLOSED == T, " The figure is stratified by study arm.", "")` The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of an RTI event`r ifelse(CLOSED == T, ", stratified by study arm", "")`, versus time. The y-axis is enhanced for clearer visualization.

```{r KM plot HS3, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event with health score 3, stratified by
# `group` (study arm). `time` is divided by 7 to match the weekly x-axis.
fit <- survfit(Surv(time / 7, status) ~ group, data = hs3_km_censor_novac)

# Two-colour cividis palette, chosen to match the other figures.
col_vir <- viridis(n = 2, end = 0.75, option = "cividis")

# Curves with confidence bands, censoring marks and a number-at-risk table.
# The plot is not stored; it is printed as the value of this expression.
ggsurvplot(
  fit,
  data = hs3_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI with severe symptoms\n(health score = 3, enhanced y-axis)",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  ylim = c(0.95, 1),
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = col_vir,
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE
)

```

### Health status 4: hospitalized (RTI events)

Kaplan-Meier estimates of the proportion of subjects who did not have an RTI event (HS=4) versus time. Only the first event is shown. `r ifelse(CLOSED == T, " The figure is stratified by study arm.", "")` The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of an RTI event`r ifelse(CLOSED == T, ", stratified by study arm", "")`, versus time. The y-axis is enhanced for clearer visualization.

```{r KM plot HS4, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event with health score 4, stratified by
# `group` (study arm). `time` is divided by 7 to match the weekly x-axis.
fit <- survfit(Surv(time / 7, status) ~ group, data = hs4_km_censor_novac)

# Two-colour cividis palette, chosen to match the other figures.
col_vir <- viridis(n = 2, end = 0.75, option = "cividis")

# Curves with confidence bands, censoring marks and a number-at-risk table.
# The plot is not stored; it is printed as the value of this expression.
ggsurvplot(
  fit,
  data = hs4_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI with hospitalization\n(health score = 4, enhanced y-axis)",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  ylim = c(0.95, 1),
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = col_vir,
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE
)
```

### Health status 5: hospitalized, oxygen (RTI events)

Kaplan-Meier estimates of the proportion of subjects who did not have an RTI event (HS=5) versus time. Only the first event is shown. `r ifelse(CLOSED == T, " The figure is stratified by study arm.", "")` The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of an RTI event`r ifelse(CLOSED == T, ", stratified by study arm", "")`, versus time. The y-axis is enhanced for clearer visualization.

```{r KM plot HS5, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event with health score 5, stratified by
# `group` (study arm). `time` is divided by 7 to match the weekly x-axis.
fit <- survfit(Surv(time / 7, status) ~ group, data = hs5_km_censor_novac)

# Two-colour cividis palette, chosen to match the other figures.
col_vir <- viridis(n = 2, end = 0.75, option = "cividis")

# Curves with confidence bands, censoring marks and a number-at-risk table.
# The plot is not stored; it is printed as the value of this expression.
ggsurvplot(
  fit,
  data = hs5_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI with hospitalization, oxygen\n(health score = 5, enhanced y-axis)",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  ylim = c(0.95, 1),
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = col_vir,
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE
)
```

### Health status 6: hospitalized, ventilated (RTI events)

Kaplan-Meier estimates of the proportion of subjects who did not have an RTI event (HS=6) versus time. Only the first event is shown. `r ifelse(CLOSED == T, " The figure is stratified by study arm.", "")` The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of an RTI event`r ifelse(CLOSED == T, ", stratified by study arm", "")`, versus time. The y-axis is enhanced for clearer visualization.

```{r KM plot HS6, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event with health score 6, stratified by
# `group` (study arm). `time` is divided by 7 to match the weekly x-axis.
fit <- survfit(Surv(time / 7, status) ~ group, data = hs6_km_censor_novac)

# Two-colour cividis palette, chosen to match the other figures.
col_vir <- viridis(n = 2, end = 0.75, option = "cividis")

# Curves with confidence bands, censoring marks and a number-at-risk table.
# The plot is not stored; it is printed as the value of this expression.
ggsurvplot(
  fit,
  data = hs6_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI with hospitalization, ventilated\n(health score = 6, enhanced y-axis)",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  ylim = c(0.95, 1),
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = col_vir,
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE
)

```

### Health status 7: death (RTI events)

Kaplan-Meier estimates of the proportion of subjects who did not have an RTI event (HS=7) versus time. `r ifelse(CLOSED == T, " The figure is stratified by study arm.", "")` The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk of an RTI event`r ifelse(CLOSED == T, ", stratified by study arm", "")`, versus time. The y-axis is enhanced for clearer visualization.

```{r KM plot HS7, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for RTI events with health score 7, stratified by `group`
# (study arm). `time` is divided by 7 to match the weekly x-axis.
fit <- survfit(Surv(time / 7, status) ~ group, data = hs7_km_censor_novac)

# Two-colour cividis palette, chosen to match the other figures.
col_vir <- viridis(n = 2, end = 0.75, option = "cividis")

# Curves with confidence bands, censoring marks and a number-at-risk table.
# The plot is not stored; it is printed as the value of this expression.
ggsurvplot(
  fit,
  data = hs7_km_censor_novac,
  title = "Kaplan-Meier plot for RTI mortality\n(health score = 7, enhanced y-axis)",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  ylim = c(0.95, 1),
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = col_vir,
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE
)

```


\newpage

# Appendix G: KM plots for RTI per risk factor/classifier

The following Kaplan-Meier figures show Kaplan-Meier results for RTI events stratified by the identified risk factors/classifiers.

### Risk factor: age

Kaplan-Meier estimates of the proportion of subjects without RTI events versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor age. The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor age, versus time.

```{r KM plot RTI events age plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Label each subject with an age band: below 30, 30 to below 50, 50 to below 70,
# and 70 or older.
hs_rti_km_censor_novac <- hs_rti_km_censor_novac %>%
  mutate(
    age_group = ifelse(
      age < 30, "18-30",
      ifelse(
        age < 50, "30-50",
        ifelse(age < 70, "50-70", "70+")
      )
    )
  )

# Kaplan-Meier fit for the first RTI event, stratified by `group` (study arm)
# and age band. `time` is divided by 7 to match the weekly x-axis.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + age_group,
  data = hs_rti_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a number-at-risk
# table (risk.table.height is the height of the risk table). The y-axis
# restriction `ylim = c(0.8, 1)` is currently disabled. The object is stored
# because the risk table is printed separately from the curves.
km_age <- ggsurvplot(
  fit2,
  data = hs_rti_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI event\nstratified on age",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = viridis(8, end = 0.75, option = "viridis"),
  legend = "left",
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.25,
  risk.table.fontsize = 3
)

# One facet per age band, with smaller axis and legend text.
km_age$plot +
  facet_grid(~age_group) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )
```

```{r KM plot RTI events age table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Print the number-at-risk table stored in the age-stratified ggsurvplot object.
km_age[["table"]]
```

\newpage

### Risk factor: gender

Kaplan-Meier estimates of the proportion of subjects without RTI events versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor gender. The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor gender, versus time.

```{r KM plot RTI events gender plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event, stratified by `group` (study arm)
# and gender. `time` is divided by 7 to match the weekly x-axis.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + gender,
  data = hs_rti_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a number-at-risk
# table (risk.table.height is the height of the risk table). Currently disabled
# options: `ylim = c(0.8, 1)`, custom arm/gender `legend.labs`, and the
# 'Arm, gender' legend title. The object is stored because the risk table is
# printed separately from the curves.
km_gender <- ggsurvplot(
  fit2,
  data = hs_rti_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI event\nstratified on gender",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = viridis(4, end = 0.75, option = "viridis"),
  legend = "left",
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.25,
  risk.table.fontsize = 3
)

# One facet per gender, with smaller axis and legend text.
km_gender$plot +
  facet_grid(~gender) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot RTI events gender table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Print the number-at-risk table stored in the gender-stratified ggsurvplot object.
km_gender[["table"]]
```

\newpage

### Risk factor: Body mass index (BMI)

Kaplan-Meier estimates of the proportion of subjects without RTI events versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor BMI. The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor BMI, versus time.

```{r KM plot RTI events bmi plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}
# Label each subject with a BMI band, using the cut-offs 18.5, 24.9, 29.9 and
# 39.9 (upper bounds inclusive).
hs_rti_km_censor_novac <- hs_rti_km_censor_novac %>%
  mutate(
    BMI_group = ifelse(
      BMI < 18.5, "< 18.5",
      ifelse(
        BMI <= 24.9, "18.5-24.9",
        ifelse(
          BMI <= 29.9, "25-29.9",
          ifelse(BMI <= 39.9, "30-39.9", "40+")
        )
      )
    )
  )

# Kaplan-Meier fit for the first RTI event, stratified by `group` (study arm)
# and BMI band. `time` is divided by 7 to match the weekly x-axis.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + BMI_group,
  data = hs_rti_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a number-at-risk
# table (risk.table.height is the height of the risk table). The y-axis
# restriction `ylim = c(0.8, 1)` is currently disabled. The object is stored
# because the risk table is printed separately from the curves.
km_bmi <- ggsurvplot(
  fit2,
  data = hs_rti_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI event\nstratified on BMI",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = viridis(10, end = 0.75, option = "viridis"),
  legend = "left",
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.25,
  risk.table.fontsize = 3
)

# One facet per BMI band, with smaller axis and legend text.
km_bmi$plot +
  facet_grid(~BMI_group) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot RTI events bmi table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Print the number-at-risk table stored in the BMI-stratified ggsurvplot object.
km_bmi[["table"]]
```

\newpage

### Classifier: ethnicity

Kaplan-Meier estimates of the proportion of subjects without RTI events versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` classifier ethnicity. The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk, stratified by`r ifelse(CLOSED == T, " study arm and", "")` classifier ethnicity, versus time.

```{r KM plot RTI events ethnicity plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}
# Kaplan-Meier fit for the first RTI event, stratified by `group` (study arm)
# and ethnicity. `time` is divided by 7 to match the weekly x-axis.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + ethnicity,
  data = hs_rti_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a number-at-risk
# table (risk.table.height is the height of the risk table). The y-axis
# restriction `ylim = c(0.8, 1)` is currently disabled. The object is stored
# because the risk table is printed separately from the curves.
km_ethnicity <- ggsurvplot(
  fit2,
  data = hs_rti_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI event\nstratified on ethnicity",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = viridis(10, end = 0.75, option = "viridis"),
  legend = "left",
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.25,
  risk.table.fontsize = 3
)

# One facet per ethnicity, with smaller axis and legend text.
km_ethnicity$plot +
  facet_grid(~ethnicity) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot RTI events ethnicity table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Print the number-at-risk table stored in the ethnicity-stratified ggsurvplot object.
km_ethnicity[["table"]]
```

\newpage

### Risk factor: job category

Kaplan-Meier estimates of the proportion of subjects without RTI events versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor job category. The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor job category, versus time.

```{r KM plot RTI events job_category plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event, stratified by `group` (study arm)
# and job category. `time` is divided by 7 to match the weekly x-axis.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + job_category,
  data = hs_rti_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a number-at-risk
# table (risk.table.height is the height of the risk table). The y-axis
# restriction `ylim = c(0.8, 1)` is currently disabled. The object is stored
# because the risk table is printed separately from the curves.
km_job_category <- ggsurvplot(
  fit2,
  data = hs_rti_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI event\nstratified on job category",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = viridis(8, end = 0.75, option = "viridis"),
  legend = "left",
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.25,
  risk.table.fontsize = 3
)

# One facet per job category; multi-word category names get a line break in the
# facet strip. Axis and legend text are reduced in size.
km_job_category$plot +
  facet_grid(
    ~job_category,
    labeller = labeller(
      job_category = c(
        "Doctor" = "Doctor",
        "Nurse" = "Nurse",
        "Essential_workers" = "Essential\nworkers",
        "Support_staff" = "Support\nstaff",
        "Frontline_workers" = "Frontline\nworkers"
      )
    )
  ) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot RTI events job_category table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Print the number-at-risk table stored in the job-category ggsurvplot object.
km_job_category[["table"]]
```

\newpage

### Risk factor: medical history of diabetes mellitus

Kaplan-Meier estimates of the proportion of subjects without RTI events versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor diabetes mellitus. The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor diabetes mellitus, versus time.

```{r KM plot RTI events medhis_dm plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event, stratified by `group` (study arm)
# and diabetes mellitus history. `time` is divided by 7 to match the weekly
# x-axis.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + medhis_dm,
  data = hs_rti_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a number-at-risk
# table (risk.table.height is the height of the risk table). The y-axis
# restriction `ylim = c(0.8, 1)` is currently disabled. The object is stored
# because the risk table is printed separately from the curves.
km_medhis_dm <- ggsurvplot(
  fit2,
  data = hs_rti_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI event\nstratified on diabetes mellitus",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = viridis(4, end = 0.75, option = "viridis"),
  legend = "left",
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.25,
  risk.table.fontsize = 3
)

# One facet per diabetes category, with smaller axis and legend text.
km_medhis_dm$plot +
  facet_grid(~medhis_dm) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot RTI events medhis_dm table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Print the number-at-risk table stored in the diabetes-stratified ggsurvplot object.
km_medhis_dm[["table"]]
```

\newpage

### Risk factor: medical history of hypertension

Kaplan-Meier estimates of the proportion of subjects without RTI events versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor hypertension. The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor hypertension, versus time.

```{r KM plot RTI events medhis_hyptens plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event, stratified by `group` (study arm)
# and hypertension history. `time` is divided by 7 to match the weekly x-axis.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + medhis_hyptens,
  data = hs_rti_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a number-at-risk
# table (risk.table.height is the height of the risk table). The y-axis
# restriction `ylim = c(0.8, 1)` is currently disabled. The object is stored
# because the risk table is printed separately from the curves.
km_medhis_hyptens <- ggsurvplot(
  fit2,
  data = hs_rti_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI event\nstratified on hypertension",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  break.time.by = 10,
  ggtheme = theme_bw(),
  palette = viridis(4, end = 0.75, option = "viridis"),
  legend = "left",
  legend.title = "",
  conf.int = TRUE,
  censor = TRUE,
  risk.table = TRUE,
  risk.table.height = 0.25,
  risk.table.fontsize = 3
)

# One facet per hypertension category, with smaller axis and legend text.
km_medhis_hyptens$plot +
  facet_grid(~medhis_hyptens) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot RTI events medhis_hyptens table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Print the number-at-risk table stored in the hypertension-stratified ggsurvplot object.
km_medhis_hyptens[["table"]]
```

\newpage

### Risk factor: medical history of cardiovascular disease

Kaplan-Meier estimates of the proportion of subjects without RTI events versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor cardiovascular disease. The shaded area represents the 95% confidence interval of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor cardiovascular disease, versus time.

```{r KM plot RTI events medhis_cvd plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event, one curve per combination of
# study arm (group) and cardiovascular disease history. time is divided by 7
# so that the curves are on the weekly scale used for the x-axis label.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + medhis_cvd,
  data = hs_rti_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a risk table.
km_medhis_cvd <- ggsurvplot(
  fit2,
  data = hs_rti_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI event\nstratified on cardiovascular disease",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  # ylim = c(0.8, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(4, end = 0.75, option = "viridis"),
  ggtheme = theme_bw(),
  legend = "left",
  legend.title = "",
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Only the curve panel is drawn here, split into one facet per
# cardiovascular disease level; the risk table is printed from its own chunk.
km_medhis_cvd$plot +
  facet_grid(~medhis_cvd) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot RTI events medhis_cvd table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table of the ggsurvplot object built in the plot chunk above.
km_medhis_cvd[["table"]]
```

\newpage

### Risk factor: medical history of asthma

Kaplan-Meier estimates of the proportion of subjects without RTI events versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor asthma. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor asthma, versus time.

```{r KM plot RTI events medhis_asthma plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event, one curve per combination of
# study arm (group) and asthma history. time is divided by 7 so that the
# curves are on the weekly scale used for the x-axis label.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + medhis_asthma,
  data = hs_rti_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a risk table.
km_medhis_asthma <- ggsurvplot(
  fit2,
  data = hs_rti_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI event\nstratified on asthma",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  # ylim = c(0.8, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(4, end = 0.75, option = "viridis"),
  ggtheme = theme_bw(),
  legend = "left",
  legend.title = "",
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Only the curve panel is drawn here, split into one facet per asthma level;
# the risk table is printed from its own chunk.
km_medhis_asthma$plot +
  facet_grid(~medhis_asthma) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot RTI events medhis_asthma table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table of the ggsurvplot object built in the plot chunk above.
km_medhis_asthma[["table"]]
```

\newpage

### Risk factor: medical history of COPD

Kaplan-Meier estimates of the proportion of subjects without RTI events versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor COPD. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor COPD, versus time.

```{r KM plot RTI events medhis_copd plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event, one curve per combination of
# study arm (group) and COPD history. time is divided by 7 so that the
# curves are on the weekly scale used for the x-axis label.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + medhis_copd,
  data = hs_rti_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a risk table.
km_medhis_copd <- ggsurvplot(
  fit2,
  data = hs_rti_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI event\nstratified on COPD",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  # ylim = c(0.8, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(4, end = 0.75, option = "viridis"),
  ggtheme = theme_bw(),
  legend = "left",
  legend.title = "",
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Only the curve panel is drawn here, split into one facet per COPD level;
# the risk table is printed from its own chunk.
km_medhis_copd$plot +
  facet_grid(~medhis_copd) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot RTI events medhis_copd table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table of the ggsurvplot object built in the plot chunk above.
km_medhis_copd[["table"]]
```

\newpage

### Risk factor: medical history of other lung disease

Kaplan-Meier estimates of the proportion of subjects without RTI events versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor other lung disease. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor other lung disease, versus time.

```{r KM plot RTI events medhis_otherlung plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event, one curve per combination of
# study arm (group) and other-lung-disease history. time is divided by 7 so
# that the curves are on the weekly scale used for the x-axis label.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + medhis_otherlung,
  data = hs_rti_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a risk table.
km_medhis_otherlung <- ggsurvplot(
  fit2,
  data = hs_rti_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI event\nstratified on other lung disease",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  # ylim = c(0.8, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(4, end = 0.75, option = "viridis"),
  ggtheme = theme_bw(),
  legend = "left",
  legend.title = "",
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Only the curve panel is drawn here, split into one facet per
# other-lung-disease level; the risk table is printed from its own chunk.
km_medhis_otherlung$plot +
  facet_grid(~medhis_otherlung) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot RTI events medhis_otherlung table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table of the ggsurvplot object built in the plot chunk above.
km_medhis_otherlung[["table"]]
```

\newpage

### Risk factor: medical history of kidney disease

Kaplan-Meier estimates of the proportion of subjects without RTI events versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor kidney disease. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor kidney disease, versus time.

```{r KM plot RTI events medhis_kd plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event, one curve per combination of
# study arm (group) and kidney disease history. time is divided by 7 so that
# the curves are on the weekly scale used for the x-axis label.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + medhis_kd,
  data = hs_rti_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a risk table.
km_medhis_kd <- ggsurvplot(
  fit2,
  data = hs_rti_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI event\nstratified on kidney disease",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  # ylim = c(0.8, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(4, end = 0.75, option = "viridis"),
  ggtheme = theme_bw(),
  legend = "left",
  legend.title = "",
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Only the curve panel is drawn here, split into one facet per kidney
# disease level; the risk table is printed from its own chunk.
km_medhis_kd$plot +
  facet_grid(~medhis_kd) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot RTI events medhis_kd table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table of the ggsurvplot object built in the plot chunk above.
km_medhis_kd[["table"]]
```

\newpage

### Risk factor: presence of BCG scar

Kaplan-Meier estimates of the proportion of subjects without RTI events versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor BCG scar. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor BCG scar, versus time.

```{r KM plot RTI events bcg_scar plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}

# Kaplan-Meier fit for the first RTI event, one curve per combination of
# study arm (group) and BCG scar status. time is divided by 7 so that the
# curves are on the weekly scale used for the x-axis label.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + bcg_scar,
  data = hs_rti_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a risk table.
km_bcg_scar <- ggsurvplot(
  fit2,
  data = hs_rti_km_censor_novac,
  title = "Kaplan-Meier plot for first RTI event\nstratified on BCG scar",
  xlab = "Time since vaccination (weeks)",
  ylab = "Total proportion without RTI event",
  # ylim = c(0.8, 1),
  break.time.by = 10,
  conf.int = TRUE,
  censor = TRUE,
  palette = viridis(4, end = 0.75, option = "viridis"),
  ggtheme = theme_bw(),
  legend = "left",
  legend.title = "",
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3
)

# Only the curve panel is drawn here, split into one facet per BCG scar
# level; the risk table is printed from its own chunk.
km_bcg_scar$plot +
  facet_grid(~bcg_scar) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot RTI events bcg_scar table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table of the ggsurvplot object built in the plot chunk above.
km_bcg_scar[["table"]]
```

\newpage

### Risk factor: currently smoking

Kaplan-Meier estimates of the proportion of subjects without RTI events versus time. The figure is stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor currently smoking. The shaded areas represent the 95% confidence intervals of the Kaplan-Meier estimates. The figure is based on the intention-to-treat population; vertical lines show censoring events (withdrawal or loss to follow-up, death, or end of trial, whichever comes first). The table below the figure shows the number of subjects at risk, stratified by`r ifelse(CLOSED == T, " study arm and", "")` risk factor currently smoking, versus time.

Currently smoking is grouped into 0, 1-20, 21-40, and >40 pack years; subjects without a recorded pack-year value are included in the 0 group.

```{r KM plot RTI events pack_years plot, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=5}
# Bin pack years into the smoking strata shown in the figure. Subjects with a
# missing pack_years value are placed in the '0' group. The top label is '40+'
# (it was '40<') so that it matches the smoke_group labels used for the
# health status figure in Appendix H. All labels are character, so the column
# type no longer depends on which branches happen to be hit.
hs_rti_km_censor_novac <- hs_rti_km_censor_novac %>%
  mutate(smoke_group = case_when(
    is.na(pack_years) | pack_years == 0 ~ '0',
    pack_years <= 20 ~ '1-20',
    pack_years <= 40 ~ '21-40',
    TRUE ~ '40+'
  ))

# Kaplan-Meier fit for the first RTI event, one curve per combination of
# study arm (group) and smoking stratum. time is divided by 7 so that the
# curves are on the weekly scale used for the x-axis label.
fit2 <- survfit(
  Surv(time / 7, status) ~ group + smoke_group,
  data = hs_rti_km_censor_novac
)

# Survival curves with confidence bands, censoring marks and a risk table.
km_pack_years <- ggsurvplot(
  fit2,
  data = hs_rti_km_censor_novac,
  ggtheme = theme_bw(),
  # One colour per fitted stratum is required. The original fixed size of 6 is
  # kept as a minimum so existing colours do not change, and the palette grows
  # when arm x smoking group yields more than 6 strata (up to 8 are possible).
  palette = viridis(max(6, length(fit2$strata)), end = 0.75, option = 'viridis'),
  xlab = 'Time since vaccination (weeks)',
  break.time.by = 10,
  ylab = 'Total proportion without RTI event',
  # ylim = c(0.8, 1),
  legend = 'left',
  legend.title = '',
  risk.table = TRUE,
  risk.table.height = 0.25, # the height of the risk table
  risk.table.fontsize = 3,
  title = 'Kaplan-Meier plot for first RTI event\nstratified on currently smoking',
  conf.int = TRUE,
  censor = TRUE
)

# Only the curve panel is drawn here, split into one facet per smoking
# stratum; the risk table is printed from its own chunk.
km_pack_years$plot +
  facet_grid(~smoke_group) +
  theme(
    axis.text = element_text(size = 7),
    legend.text = element_text(size = 7)
  )

```

```{r KM plot RTI events pack_years table, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Number-at-risk table of the ggsurvplot object built in the plot chunk above.
km_pack_years[["table"]]
```


\newpage

# Appendix H: Health status score over time

Health status score over time plots stratified by the identified risk factors/classifiers. 

### Risk factor: age

Health status over time for each subject in the study (dashed lines). The figure is stratified by the risk factor age`r ifelse(CLOSED == T, " and coloured by the study arm", "")`. The solid lines represent a loess smooth of the score in each risk factor strata`r ifelse(CLOSED == T, " and study arm", "")` over time.

```{r health status over time age, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=3}
# Join the health status records with df_risk on subject (PID) and arm
# (group), then derive the age band used for faceting. Bands are assigned by
# strict upper bounds, so an age of exactly 30 falls in '30-50'.
hs_time_risk <- hs_time %>%
  left_join(df_risk, by = c("PID", "group")) %>%
  mutate(
    age_group = ifelse(
      age < 30, "18-30",
      ifelse(
        age < 50, "30-50",
        ifelse(age < 70, "50-70", "70+")
      )
    )
  )

# Dashed per-subject profiles, points, a loess smooth per arm and a dotted
# reference line at a score of 4; one facet per age band.
ggplot(
  data = filter(hs_time_risk, !is.na(HS)),
  mapping = aes(x = Time, y = HS, group = factor(group), colour = factor(group))
) +
  geom_line(aes(group = PID), linetype = "dashed", alpha = 0.25) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  geom_hline(yintercept = 4, linetype = "dotted") +
  scale_x_continuous(name = "Time since vaccination (week)") +
  scale_y_continuous(
    name = "Individual health status score",
    limits = c(0, 7),
    breaks = 0:7
  ) +
  scale_colour_viridis(name = "Arm", option = "cividis", discrete = TRUE, end = 0.75) +
  facet_grid(~age_group) +
  theme_bw()
```

### Risk factor: gender

Health status over time for each subject in the study (dashed lines). The figure is stratified by the risk factor gender`r ifelse(CLOSED == T, " and coloured by the study arm", "")`. The solid lines represent a loess smooth of the score in each risk factor strata`r ifelse(CLOSED == T, " and study arm", "")` over time.


```{r health status over time gender, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=3}
# Join the health status records with df_risk on subject (PID) and arm (group).
hs_time_risk <- hs_time %>%
  left_join(df_risk, by = c("PID", "group"))

# Dashed per-subject profiles, points, a loess smooth per arm and a dotted
# reference line at a score of 4; one facet per gender level.
ggplot(
  data = filter(hs_time_risk, !is.na(HS)),
  mapping = aes(x = Time, y = HS, group = factor(group), colour = factor(group))
) +
  geom_line(aes(group = PID), linetype = "dashed", alpha = 0.25) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  geom_hline(yintercept = 4, linetype = "dotted") +
  scale_x_continuous(name = "Time since vaccination (week)") +
  scale_y_continuous(
    name = "Individual health status score",
    limits = c(0, 7),
    breaks = 0:7
  ) +
  scale_colour_viridis(name = "Arm", option = "cividis", discrete = TRUE, end = 0.75) +
  facet_grid(~gender) +
  theme_bw()
```

\newpage

### Risk factor: body mass index (BMI)

Health status over time for each subject in the study (dashed lines). The figure is stratified by the risk factor BMI`r ifelse(CLOSED == T, " and coloured by the study arm", "")`. The solid lines represent a loess smooth of the score in each risk factor strata`r ifelse(CLOSED == T, " and study arm", "")` over time.


```{r health status over time BMI, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=3}
# Join the health status records with df_risk on subject (PID) and arm
# (group), then derive the BMI class used for faceting. Classes are assigned
# by inclusive upper bounds (24.9, 29.9, 39.9); anything above is '40+'.
hs_time_risk <- hs_time %>%
  left_join(df_risk, by = c("PID", "group")) %>%
  mutate(
    BMI_group = ifelse(
      BMI < 18.5, "< 18.5",
      ifelse(
        BMI <= 24.9, "18.5-24.9",
        ifelse(
          BMI <= 29.9, "25-29.9",
          ifelse(BMI <= 39.9, "30-39.9", "40+")
        )
      )
    )
  )

# Dashed per-subject profiles, points, a loess smooth per arm and a dotted
# reference line at a score of 4; one facet per BMI class.
ggplot(
  data = filter(hs_time_risk, !is.na(HS)),
  mapping = aes(x = Time, y = HS, group = factor(group), colour = factor(group))
) +
  geom_line(aes(group = PID), linetype = "dashed", alpha = 0.25) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  geom_hline(yintercept = 4, linetype = "dotted") +
  scale_x_continuous(name = "Time since vaccination (week)") +
  scale_y_continuous(
    name = "Individual health status score",
    limits = c(0, 7),
    breaks = 0:7
  ) +
  scale_colour_viridis(name = "Arm", option = "cividis", discrete = TRUE, end = 0.75) +
  facet_grid(~BMI_group) +
  theme_bw()
```

### Classifier: ethnicity

Health status over time for each subject in the study (dashed lines). The figure is stratified by the classifier ethnicity`r ifelse(CLOSED == T, " and coloured by the study arm", "")`. The solid lines represent a loess smooth of the score in each classifier strata`r ifelse(CLOSED == T, " and study arm", "")` over time.


```{r health status over time ethnicity, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=3}
# Join the health status records with df_risk on subject (PID) and arm (group).
hs_time_risk <- hs_time %>%
  left_join(df_risk, by = c("PID", "group"))

# Dashed per-subject profiles, points, a loess smooth per arm and a dotted
# reference line at a score of 4; one facet per ethnicity level.
ggplot(
  data = filter(hs_time_risk, !is.na(HS)),
  mapping = aes(x = Time, y = HS, group = factor(group), colour = factor(group))
) +
  geom_line(aes(group = PID), linetype = "dashed", alpha = 0.25) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  geom_hline(yintercept = 4, linetype = "dotted") +
  scale_x_continuous(name = "Time since vaccination (week)") +
  scale_y_continuous(
    name = "Individual health status score",
    limits = c(0, 7),
    breaks = 0:7
  ) +
  scale_colour_viridis(name = "Arm", option = "cividis", discrete = TRUE, end = 0.75) +
  facet_grid(~ethnicity) +
  theme_bw()
```

\newpage

### Risk factor: job category

Health status over time for each subject in the study (dashed lines). The figure is stratified by the risk factor job category`r ifelse(CLOSED == T, " and coloured by the study arm", "")`. The solid lines represent a loess smooth of the score in each risk factor strata`r ifelse(CLOSED == T, " and study arm", "")` over time.


```{r health status over time job, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=3}
# Join the health status records with df_risk on subject (PID) and arm (group).
hs_time_risk <- hs_time %>%
  left_join(df_risk, by = c("PID", "group"))

# Dashed per-subject profiles, points, a loess smooth per arm and a dotted
# reference line at a score of 4; one facet per job category.
ggplot(
  data = filter(hs_time_risk, !is.na(HS)),
  mapping = aes(x = Time, y = HS, group = factor(group), colour = factor(group))
) +
  geom_line(aes(group = PID), linetype = "dashed", alpha = 0.25) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  geom_hline(yintercept = 4, linetype = "dotted") +
  scale_x_continuous(name = "Time since vaccination (week)") +
  scale_y_continuous(
    name = "Individual health status score",
    limits = c(0, 7),
    breaks = 0:7
  ) +
  scale_colour_viridis(name = "Arm", option = "cividis", discrete = TRUE, end = 0.75) +
  facet_grid(~job_category) +
  theme_bw()
```

### Risk factor: medical history of diabetes mellitus

Health status over time for each subject in the study (dashed lines). The figure is stratified by the risk factor diabetes mellitus`r ifelse(CLOSED == T, " and coloured by the study arm", "")`. The solid lines represent a loess smooth of the score in each risk factor strata`r ifelse(CLOSED == T, " and study arm", "")` over time.


```{r health status over time medhis_dm, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=3}
# Join the health status records with df_risk on subject (PID) and arm (group).
hs_time_risk <- hs_time %>%
  left_join(df_risk, by = c("PID", "group"))

# Dashed per-subject profiles, points, a loess smooth per arm and a dotted
# reference line at a score of 4; one facet per diabetes mellitus level.
ggplot(
  data = filter(hs_time_risk, !is.na(HS)),
  mapping = aes(x = Time, y = HS, group = factor(group), colour = factor(group))
) +
  geom_line(aes(group = PID), linetype = "dashed", alpha = 0.25) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  geom_hline(yintercept = 4, linetype = "dotted") +
  scale_x_continuous(name = "Time since vaccination (week)") +
  scale_y_continuous(
    name = "Individual health status score",
    limits = c(0, 7),
    breaks = 0:7
  ) +
  scale_colour_viridis(name = "Arm", option = "cividis", discrete = TRUE, end = 0.75) +
  facet_grid(~medhis_dm) +
  theme_bw()
```

\newpage

### Risk factor: medical history of hypertension

Health status over time for each subject in the study (dashed lines). The figure is stratified by the risk factor hypertension`r ifelse(CLOSED == T, " and coloured by the study arm", "")`. The solid lines represent a loess smooth of the score in each risk factor strata`r ifelse(CLOSED == T, " and study arm", "")` over time.


```{r health status over time medhis_hyptens, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=3}
# Join the health status records with df_risk on subject (PID) and arm (group).
hs_time_risk <- hs_time %>%
  left_join(df_risk, by = c("PID", "group"))

# Dashed per-subject profiles, points, a loess smooth per arm and a dotted
# reference line at a score of 4; one facet per hypertension level.
ggplot(
  data = filter(hs_time_risk, !is.na(HS)),
  mapping = aes(x = Time, y = HS, group = factor(group), colour = factor(group))
) +
  geom_line(aes(group = PID), linetype = "dashed", alpha = 0.25) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  geom_hline(yintercept = 4, linetype = "dotted") +
  scale_x_continuous(name = "Time since vaccination (week)") +
  scale_y_continuous(
    name = "Individual health status score",
    limits = c(0, 7),
    breaks = 0:7
  ) +
  scale_colour_viridis(name = "Arm", option = "cividis", discrete = TRUE, end = 0.75) +
  facet_grid(~medhis_hyptens) +
  theme_bw()
```

### Risk factor: medical history of cardiovascular disease

Health status over time for each subject in the study (dashed lines). The figure is stratified by the risk factor cardiovascular disease`r ifelse(CLOSED == T, " and coloured by the study arm", "")`. The solid lines represent a loess smooth of the score in each risk factor strata`r ifelse(CLOSED == T, " and study arm", "")` over time.


```{r health status over time medhis_cvd, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=3}
# fig.height is 3 (it was 7) to match every other health-status-over-time
# chunk in this appendix, so that all of these figures share one size.

# Join the health status records with df_risk on subject (PID) and arm (group).
hs_time_risk <- left_join(hs_time, df_risk, by = c('PID', 'group'))

# Dashed per-subject profiles, points, a loess smooth per arm and a dotted
# reference line at a score of 4; one facet per cardiovascular disease level.
hs_time_risk %>%
  filter(!is.na(HS)) %>%
  ggplot(aes(Time, HS, group = factor(group), col = factor(group))) +
  geom_line(aes(group = PID), linetype = 'dashed', alpha = 0.25) +
  geom_point() +
  geom_smooth(se = FALSE, method = 'loess') +
  scale_colour_viridis(discrete = TRUE, end = 0.75, name = 'Arm', option = 'cividis') +
  scale_x_continuous(name = 'Time since vaccination (week)') +
  scale_y_continuous(limits = c(0, 7), breaks = 0:7, name = 'Individual health status score') +
  geom_hline(yintercept = 4, linetype = 'dotted') +
  theme_bw() +
  facet_grid(~medhis_cvd)
```

\newpage

### Risk factor: medical history of asthma

Health status over time for each subject in the study (dashed lines). The figure is stratified by the risk factor asthma`r ifelse(CLOSED == T, " and coloured by the study arm", "")`. The solid lines represent a loess smooth of the score in each risk factor strata`r ifelse(CLOSED == T, " and study arm", "")` over time.


```{r health status over time medhis_asthma, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=3}
# Join the health status records with df_risk on subject (PID) and arm (group).
hs_time_risk <- hs_time %>%
  left_join(df_risk, by = c("PID", "group"))

# Dashed per-subject profiles, points, a loess smooth per arm and a dotted
# reference line at a score of 4; one facet per asthma level.
ggplot(
  data = filter(hs_time_risk, !is.na(HS)),
  mapping = aes(x = Time, y = HS, group = factor(group), colour = factor(group))
) +
  geom_line(aes(group = PID), linetype = "dashed", alpha = 0.25) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  geom_hline(yintercept = 4, linetype = "dotted") +
  scale_x_continuous(name = "Time since vaccination (week)") +
  scale_y_continuous(
    name = "Individual health status score",
    limits = c(0, 7),
    breaks = 0:7
  ) +
  scale_colour_viridis(name = "Arm", option = "cividis", discrete = TRUE, end = 0.75) +
  facet_grid(~medhis_asthma) +
  theme_bw()
```

### Risk factor: medical history of COPD

Health status over time for each subject in the study (dashed lines). The figure is stratified by the risk factor COPD`r ifelse(CLOSED == T, " and coloured by the study arm", "")`. The solid lines represent a loess smooth of the score in each risk factor strata`r ifelse(CLOSED == T, " and study arm", "")` over time.


```{r health status over time medhis_copd, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=3}
# Join the health status records with df_risk on subject (PID) and arm (group).
hs_time_risk <- hs_time %>%
  left_join(df_risk, by = c("PID", "group"))

# Dashed per-subject profiles, points, a loess smooth per arm and a dotted
# reference line at a score of 4; one facet per COPD level.
ggplot(
  data = filter(hs_time_risk, !is.na(HS)),
  mapping = aes(x = Time, y = HS, group = factor(group), colour = factor(group))
) +
  geom_line(aes(group = PID), linetype = "dashed", alpha = 0.25) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  geom_hline(yintercept = 4, linetype = "dotted") +
  scale_x_continuous(name = "Time since vaccination (week)") +
  scale_y_continuous(
    name = "Individual health status score",
    limits = c(0, 7),
    breaks = 0:7
  ) +
  scale_colour_viridis(name = "Arm", option = "cividis", discrete = TRUE, end = 0.75) +
  facet_grid(~medhis_copd) +
  theme_bw()
```

\newpage

### Risk factor: medical history of other lung disease

Health status over time for each subject in the study (dashed lines). The figure is stratified by the risk factor other lung disease`r ifelse(CLOSED == T, " and coloured by the study arm", "")`. The solid lines represent a loess smooth of the score in each risk factor strata`r ifelse(CLOSED == T, " and study arm", "")` over time.


```{r health status over time medhis_otherlung, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=3}
# Join the health status records with df_risk on subject (PID) and arm (group).
hs_time_risk <- hs_time %>%
  left_join(df_risk, by = c("PID", "group"))

# Dashed per-subject profiles, points, a loess smooth per arm and a dotted
# reference line at a score of 4; one facet per other-lung-disease level.
ggplot(
  data = filter(hs_time_risk, !is.na(HS)),
  mapping = aes(x = Time, y = HS, group = factor(group), colour = factor(group))
) +
  geom_line(aes(group = PID), linetype = "dashed", alpha = 0.25) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  geom_hline(yintercept = 4, linetype = "dotted") +
  scale_x_continuous(name = "Time since vaccination (week)") +
  scale_y_continuous(
    name = "Individual health status score",
    limits = c(0, 7),
    breaks = 0:7
  ) +
  scale_colour_viridis(name = "Arm", option = "cividis", discrete = TRUE, end = 0.75) +
  facet_grid(~medhis_otherlung) +
  theme_bw()
```

### Risk factor: medical history of kidney disease

Health status over time for each subject in the study (dashed lines). The figure is stratified by the risk factor kidney disease`r ifelse(CLOSED == T, " and coloured by the study arm", "")`. The solid lines represent a loess smooth of the score in each risk factor strata`r ifelse(CLOSED == T, " and study arm", "")` over time.


```{r health status over time medhis_kd, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=3}
# Join the health status records with df_risk on subject (PID) and arm (group).
hs_time_risk <- hs_time %>%
  left_join(df_risk, by = c("PID", "group"))

# Dashed per-subject profiles, points, a loess smooth per arm and a dotted
# reference line at a score of 4; one facet per kidney disease level.
ggplot(
  data = filter(hs_time_risk, !is.na(HS)),
  mapping = aes(x = Time, y = HS, group = factor(group), colour = factor(group))
) +
  geom_line(aes(group = PID), linetype = "dashed", alpha = 0.25) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  geom_hline(yintercept = 4, linetype = "dotted") +
  scale_x_continuous(name = "Time since vaccination (week)") +
  scale_y_continuous(
    name = "Individual health status score",
    limits = c(0, 7),
    breaks = 0:7
  ) +
  scale_colour_viridis(name = "Arm", option = "cividis", discrete = TRUE, end = 0.75) +
  facet_grid(~medhis_kd) +
  theme_bw()
```

\newpage

### Risk factor: presence of BCG scar

Health status over time for each subject in the study (dashed lines). The figure is stratified by the risk factor BCG scar`r ifelse(CLOSED == T, " and coloured by the study arm", "")`. The solid lines represent a loess smooth of the score in each risk factor strata`r ifelse(CLOSED == T, " and study arm", "")` over time.


```{r health status over time bcg_scar, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=3}
# Join the health status records with df_risk on subject (PID) and arm (group).
hs_time_risk <- hs_time %>%
  left_join(df_risk, by = c("PID", "group"))

# Dashed per-subject profiles, points, a loess smooth per arm and a dotted
# reference line at a score of 4; one facet per BCG scar level.
ggplot(
  data = filter(hs_time_risk, !is.na(HS)),
  mapping = aes(x = Time, y = HS, group = factor(group), colour = factor(group))
) +
  geom_line(aes(group = PID), linetype = "dashed", alpha = 0.25) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  geom_hline(yintercept = 4, linetype = "dotted") +
  scale_x_continuous(name = "Time since vaccination (week)") +
  scale_y_continuous(
    name = "Individual health status score",
    limits = c(0, 7),
    breaks = 0:7
  ) +
  scale_colour_viridis(name = "Arm", option = "cividis", discrete = TRUE, end = 0.75) +
  facet_grid(~bcg_scar) +
  theme_bw()
```

### Risk factor: currently smoking

Health status over time for each subject in the study (dashed lines). The figure is stratified by the risk factor currently smoking`r ifelse(CLOSED == T, " and coloured by the study arm", "")`. The solid lines represent a loess smooth of the score in each risk factor strata`r ifelse(CLOSED == T, " and study arm", "")` over time.


```{r health status over time pack_years, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=3}
# Join the health status records with df_risk on subject (PID) and arm
# (group), then bin pack years into the smoking strata used for faceting.
# A missing pack_years value is assigned to the 0 group.
hs_time_risk <- hs_time %>%
  left_join(df_risk, by = c("PID", "group")) %>%
  mutate(
    smoke_group = ifelse(
      is.na(pack_years), 0,
      ifelse(
        pack_years == 0, "0",
        ifelse(
          pack_years <= 20, "1-20",
          ifelse(
            pack_years <= 40, "21-40",
            ifelse(pack_years > 40, "40+", 0)
          )
        )
      )
    )
  )

# Dashed per-subject profiles, points, a loess smooth per arm and a dotted
# reference line at a score of 4; one facet per smoking stratum.
ggplot(
  data = filter(hs_time_risk, !is.na(HS)),
  mapping = aes(x = Time, y = HS, group = factor(group), colour = factor(group))
) +
  geom_line(aes(group = PID), linetype = "dashed", alpha = 0.25) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE) +
  geom_hline(yintercept = 4, linetype = "dotted") +
  scale_x_continuous(name = "Time since vaccination (week)") +
  scale_y_continuous(
    name = "Individual health status score",
    limits = c(0, 7),
    breaks = 0:7
  ) +
  scale_colour_viridis(name = "Arm", option = "cividis", discrete = TRUE, end = 0.75) +
  facet_grid(~smoke_group) +
  theme_bw()
```

### Individual plots of health status scores over time

The figures below show the health status score over time per individual subject for RTI events; the event number is shown in the bar above each plot.

```{r health status over time PIDs, eval = eval_dsmb, echo = echo_dsmb, message = F, fig.height=2}
# Subjects to plot: those whose non-missing HS values sum to more than zero
# (profiles consisting only of HS = 0 are not plotted).
PID_ind_hs_time <- hs_time %>%
  filter(!is.na(HS)) %>%
  group_by(PID) %>%
  filter(sum(HS) > 0) %>%
  distinct(PID) %>%
  unlist()

# One figure per selected subject, titled with the PID and faceted by event
# number. Events whose HS values sum to zero are dropped before plotting.
for (i in PID_ind_hs_time) {
  print(
    ggplot(
      data = hs_time %>%
        filter(PID == i, !is.na(HS)) %>%
        filter(sum(HS) > 0) %>% # do not plot only HS = 0 profiles
        group_by(event_number) %>%
        filter(sum(HS) > 0) %>%
        ungroup(),
      mapping = aes(x = Time, y = HS)
    ) +
      geom_line() +
      geom_point() +
      geom_hline(yintercept = 4, linetype = "dotted") +
      # NOTE(review): no colour aesthetic is mapped in this plot, so this
      # scale appears to have no effect - confirm before removing.
      scale_colour_viridis(name = "Arm", option = "cividis", discrete = TRUE, end = 0.75) +
      scale_x_continuous(name = "Time since vaccination (week)") +
      scale_y_continuous(name = "Individual HS score", limits = c(0, 7), breaks = 0:7) +
      ggtitle(i) +
      facet_grid(~event_number) +
      theme_bw()
  )
}

```


\newpage

# Appendix I: MedDRA

Medical Dictionary for Regulatory Activities (MedDRA) terms are used in this report.

```{r meddra, eval = eval_dsmb, echo = echo_dsmb}
# Table of the MedDRA terms: one row per LLT code, columns LLT through
# SOC_code, scaled to the page width and held in position.
df2_meddra %>%
  # keep a single row for each LLT code
  distinct(LLT_code, .keep_all = TRUE) %>%
  # NOTE(review): rows are sorted on LLT; the earlier comment said
  # "order on PT" - confirm which ordering is intended.
  arrange(LLT) %>%
  select(LLT:SOC_code) %>%
  kable(row.names = TRUE) %>%
  kable_styling(latex_options = c("scale_down", "HOLD_position"))
```

*LLT = Lowest level terms, PT = Preferred terms, HLT = High-level terms, HLGT = High-level group terms, SOC = System organ class*


\newpage

# Appendix J: Risk factors per site

## Baseline serology for SARS-CoV-2 per site

At baseline, participants were tested for SARS-CoV-2 infection based on immunoglobulin G (IgG) serology.

### Central

`r if (CLOSED == TRUE) paste0(c19_serology %>% filter(visit_week_id_3 == 0, site == 'Central', group == Arm1) %>% distinct(PID) %>% nrow(), " (", round(100 * (c19_serology %>% filter(visit_week_id_3 == 0, site == 'Central', group == Arm1) %>% distinct(PID) %>% nrow()) / part_enrol_central_group1, 1), "%) participants had a baseline positive serology result for SARS-CoV-2 based on IgG in the ", Arm1, " arm.  \n", c19_serology %>% filter(visit_week_id_3 == 0, site == 'Central', group == Arm2) %>% distinct(PID) %>% nrow(), " (", round(100 * (c19_serology %>% filter(visit_week_id_3 == 0, site == 'Central', group == Arm2) %>% distinct(PID) %>% nrow()) / part_enrol_central_group2, 1), "%) participants had a baseline positive serology result for SARS-CoV-2 based on IgG in the ", Arm2, " arm.") else paste0(c19_serology %>% filter(visit_week_id_3 == 0, site == 'Central') %>% distinct(PID) %>% nrow(), " (", round(100 * (c19_serology %>% filter(visit_week_id_3 == 0, site == 'Central') %>% distinct(PID) %>% nrow()) / part_enrol_central, 1), "%) participants had a baseline positive serology result for SARS-CoV-2 based on IgG.")` 

### Eden

`r if (CLOSED == TRUE) paste0(c19_serology %>% filter(visit_week_id_3 == 0, site == 'Eden', group == Arm1) %>% distinct(PID) %>% nrow(), " (", round(100 * (c19_serology %>% filter(visit_week_id_3 == 0, site == 'Eden', group == Arm1) %>% distinct(PID) %>% nrow()) / part_enrol_eden_group1, 1), "%) participants had a baseline positive serology result for SARS-CoV-2 based on IgG in the ", Arm1, " arm.  \n", c19_serology %>% filter(visit_week_id_3 == 0, site == 'Eden', group == Arm2) %>% distinct(PID) %>% nrow(), " (", round(100 * (c19_serology %>% filter(visit_week_id_3 == 0, site == 'Eden', group == Arm2) %>% distinct(PID) %>% nrow()) / part_enrol_eden_group2, 1), "%) participants had a baseline positive serology result for SARS-CoV-2 based on IgG in the ", Arm2, " arm.") else paste0(c19_serology %>% filter(visit_week_id_3 == 0, site == 'Eden') %>% distinct(PID) %>% nrow(), " (", round(100 * (c19_serology %>% filter(visit_week_id_3 == 0, site == 'Eden') %>% distinct(PID) %>% nrow()) / part_enrol_eden, 1), "%) participants had a baseline positive serology result for SARS-CoV-2 based on IgG.")` 

### UCT

<!-- Baseline SARS-CoV-2 serology, UCT site: number of distinct PIDs with a week 0 record in c19_serology, reported as n (%) of enrolled participants; per arm in the closed report, pooled otherwise. NOTE(review): no filter on the serology result itself is applied here; presumably c19_serology only holds positive results, please confirm. -->

`r local({
  baseline_records <- c19_serology %>% filter(visit_week_id_3 == 0, site == "UCT")
  sentence <- function(records, enrolled, ending) {
    n_pid <- nrow(distinct(records, PID))
    paste0(n_pid, " (", round(100 * n_pid / enrolled, 1), "%) participants had a baseline positive serology result for SARS-CoV-2 based on IgG", ending)
  }
  if (CLOSED == TRUE) {
    paste0(
      sentence(filter(baseline_records, group == Arm1), part_enrol_uct_group1, paste0(" in the ", Arm1, " arm.  \n")),
      sentence(filter(baseline_records, group == Arm2), part_enrol_uct_group2, paste0(" in the ", Arm2, " arm."))
    )
  } else {
    sentence(baseline_records, part_enrol_uct, ".")
  }
})` 


## Baseline tuberculosis per site

At baseline, participants are tested for TB infection based on an interferon-gamma release assay (IGRA).

### Central 

<!-- Baseline TB infection, Central site: number of distinct PIDs in df_full with igra equal to POS, reported as n (%) of enrolled participants; per arm in the closed report, pooled otherwise. Each count is computed once and reused for the percentage. -->

`r local({
  igra_positive <- df_full %>% filter(igra == "POS", site == "Central")
  sentence <- function(records, enrolled, ending) {
    n_pid <- nrow(distinct(records, PID))
    paste0(n_pid, " (", round(100 * n_pid / enrolled, 1), "%) participants had a baseline positive IGRA result for TB", ending)
  }
  if (isTRUE(CLOSED)) {
    paste0(
      sentence(filter(igra_positive, group == Arm1), part_enrol_central_group1, paste0(" in the ", Arm1, " arm.  \n")),
      sentence(filter(igra_positive, group == Arm2), part_enrol_central_group2, paste0(" in the ", Arm2, " arm."))
    )
  } else {
    sentence(igra_positive, part_enrol_central, ".")
  }
})` 

### Eden 

<!-- Baseline TB infection, Eden site: number of distinct PIDs in df_full with igra equal to POS, reported as n (%) of enrolled participants; per arm in the closed report, pooled otherwise. Each count is computed once and reused for the percentage. -->

`r local({
  igra_positive <- df_full %>% filter(igra == "POS", site == "Eden")
  sentence <- function(records, enrolled, ending) {
    n_pid <- nrow(distinct(records, PID))
    paste0(n_pid, " (", round(100 * n_pid / enrolled, 1), "%) participants had a baseline positive IGRA result for TB", ending)
  }
  if (isTRUE(CLOSED)) {
    paste0(
      sentence(filter(igra_positive, group == Arm1), part_enrol_eden_group1, paste0(" in the ", Arm1, " arm.  \n")),
      sentence(filter(igra_positive, group == Arm2), part_enrol_eden_group2, paste0(" in the ", Arm2, " arm."))
    )
  } else {
    sentence(igra_positive, part_enrol_eden, ".")
  }
})` 

### UCT 

<!-- Baseline TB infection, UCT site: number of distinct PIDs in df_full with igra equal to POS, reported as n (%) of enrolled participants; per arm in the closed report, pooled otherwise. Each count is computed once and reused for the percentage. -->

`r local({
  igra_positive <- df_full %>% filter(igra == "POS", site == "UCT")
  sentence <- function(records, enrolled, ending) {
    n_pid <- nrow(distinct(records, PID))
    paste0(n_pid, " (", round(100 * n_pid / enrolled, 1), "%) participants had a baseline positive IGRA result for TB", ending)
  }
  if (isTRUE(CLOSED)) {
    paste0(
      sentence(filter(igra_positive, group == Arm1), part_enrol_uct_group1, paste0(" in the ", Arm1, " arm.  \n")),
      sentence(filter(igra_positive, group == Arm2), part_enrol_uct_group2, paste0(" in the ", Arm2, " arm."))
    )
  } else {
    sentence(igra_positive, part_enrol_uct, ".")
  }
})` 

\newpage

## Numerical demographics

### Central

Missing values are reported as NA in the `r ifelse(CLOSED == T, 'tables', 'table')` below.

`r if (CLOSED == TRUE) paste0("**the ", Arm1, " arm**") else ""`
```{r numerical demographics table group1 c, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Replace the value 999 in height and weight by NA (it appears to be the
# missing-value code), and blank the BMI on the same rows.
# The cleaned data frame overwrites df_demographics_numerical, so every later
# chunk sees the NAs as well.
df_demographics_numerical <- df_demographics_numerical %>%
  mutate(
    BMI = replace(BMI, height == 999 | weight == 999, NA),
    height = replace(height, height == 999, NA),
    weight = replace(weight, weight == 999, NA)
  )

# Numerical demographics of the first arm at the Central site.
df_demographics_numerical %>%
  filter(group == Arm1, site == "Central") %>%
  select(-c(group, site)) %>%
  summary_table() %>%
  kable()
```

`r if (CLOSED == TRUE) paste0("**the ", Arm2, " arm**") else ""`
```{r numerical demographics table group2 c, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Numerical demographics of the second arm at the Central site.
df_demographics_numerical %>%
  filter(group == Arm2, site == "Central") %>%
  select(-c(group, site)) %>%
  summary_table() %>%
  kable()
```

```{r numerical demographics table c, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Replace the value 999 in height and weight by NA (it appears to be the
# missing-value code), and blank the BMI on the same rows.
# The cleaned data frame overwrites df_demographics_numerical, so every later
# chunk sees the NAs as well.
df_demographics_numerical <- df_demographics_numerical %>%
  mutate(
    BMI = replace(BMI, height == 999 | weight == 999, NA),
    height = replace(height, height == 999, NA),
    weight = replace(weight, weight == 999, NA)
  )

# Numerical demographics at the Central site, both arms pooled (open report).
df_demographics_numerical %>%
  filter(site == "Central") %>%
  select(-c(group, site)) %>%
  summary_table() %>%
  kable()
```

### Eden

Missing values are reported as NA in the `r ifelse(CLOSED == T, 'tables', 'table')` below.

`r if (CLOSED == TRUE) paste0("**the ", Arm1, " arm**") else ""`
```{r numerical demographics table group1 e, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Replace the value 999 in height and weight by NA (it appears to be the
# missing-value code), and blank the BMI on the same rows.
# The cleaned data frame overwrites df_demographics_numerical.
df_demographics_numerical <- df_demographics_numerical %>%
  mutate(
    BMI = replace(BMI, height == 999 | weight == 999, NA),
    height = replace(height, height == 999, NA),
    weight = replace(weight, weight == 999, NA)
  )

# Numerical demographics of the first arm at the Eden site.
df_demographics_numerical %>%
  filter(group == Arm1, site == "Eden") %>%
  select(-c(group, site)) %>%
  summary_table() %>%
  kable()
```

`r if (CLOSED == TRUE) paste0("**the ", Arm2, " arm**") else ""`
```{r numerical demographics table group2 e, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Numerical demographics of the second arm at the Eden site.
df_demographics_numerical %>%
  filter(group == Arm2, site == "Eden") %>%
  select(-c(group, site)) %>%
  summary_table() %>%
  kable()
```

```{r numerical demographics table e, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Replace the value 999 in height and weight by NA (it appears to be the
# missing-value code), and blank the BMI on the same rows.
# The cleaned data frame overwrites df_demographics_numerical.
df_demographics_numerical <- df_demographics_numerical %>%
  mutate(
    BMI = replace(BMI, height == 999 | weight == 999, NA),
    height = replace(height, height == 999, NA),
    weight = replace(weight, weight == 999, NA)
  )

# Numerical demographics at the Eden site, both arms pooled (open report).
df_demographics_numerical %>%
  filter(site == "Eden") %>%
  select(-c(group, site)) %>%
  summary_table() %>%
  kable()
```

### UCT

Missing values are reported as NA in the `r ifelse(CLOSED == T, 'tables', 'table')` below.

`r if (CLOSED == TRUE) paste0("**the ", Arm1, " arm**") else ""`
```{r numerical demographics table group1 u, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Replace the value 999 in height and weight by NA (it appears to be the
# missing-value code), and blank the BMI on the same rows.
# The cleaned data frame overwrites df_demographics_numerical.
df_demographics_numerical <- df_demographics_numerical %>%
  mutate(
    BMI = replace(BMI, height == 999 | weight == 999, NA),
    height = replace(height, height == 999, NA),
    weight = replace(weight, weight == 999, NA)
  )

# Numerical demographics of the first arm at the UCT site.
df_demographics_numerical %>%
  filter(group == Arm1, site == "UCT") %>%
  select(-c(group, site)) %>%
  summary_table() %>%
  kable()
```

`r if (CLOSED == TRUE) paste0("**the ", Arm2, " arm**") else ""`
```{r numerical demographics table group2 u, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Numerical demographics of the second arm at the UCT site.
df_demographics_numerical %>%
  filter(group == Arm2, site == "UCT") %>%
  select(-c(group, site)) %>%
  summary_table() %>%
  kable()
```

```{r numerical demographics table u, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Replace the value 999 in height and weight by NA (it appears to be the
# missing-value code), and blank the BMI on the same rows.
# The cleaned data frame overwrites df_demographics_numerical.
df_demographics_numerical <- df_demographics_numerical %>%
  mutate(
    BMI = replace(BMI, height == 999 | weight == 999, NA),
    height = replace(height, height == 999, NA),
    weight = replace(weight, weight == 999, NA)
  )

# Numerical demographics at the UCT site, both arms pooled (open report).
df_demographics_numerical %>%
  filter(site == "UCT") %>%
  select(-c(group, site)) %>%
  summary_table() %>%
  kable()
```


\newpage
## Binary demographics

Empty records are assigned either '' (blank) or NA.

### Central

`r if (isTRUE(CLOSED)) paste0("**the ", Arm1, " arm**") else ""`
```{r binary demographics table group1 c, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Binary demographics of the first arm at the Central site.
# Blank entries ('') are recoded to NA column by column so that they are counted
# as missing in the summary. na_if() is not called on the whole data frame
# because dplyr >= 1.1.0 rejects that usage; replace() works for both factor
# and character columns.
df_demographics_binary %>%
  filter(group == Arm1, site == "Central") %>%
  mutate(across(everything(), ~ replace(.x, .x %in% "", NA))) %>%
  select(!c(group, site)) %>%
  summary() %>%
  kable()
```

`r if (isTRUE(CLOSED)) paste0("**the ", Arm2, " arm**") else ""`
```{r binary demographics table group2 c, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Binary demographics of the second arm at the Central site.
# Blank entries ('') are recoded to NA column by column so that they are counted
# as missing in the summary. na_if() is not called on the whole data frame
# because dplyr >= 1.1.0 rejects that usage; replace() works for both factor
# and character columns.
df_demographics_binary %>%
  filter(group == Arm2, site == "Central") %>%
  mutate(across(everything(), ~ replace(.x, .x %in% "", NA))) %>%
  select(!c(group, site)) %>%
  summary() %>%
  kable()
```

```{r binary demographics table c, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Binary demographics at the Central site, both arms pooled (open report).
# Blank entries ('') are recoded to NA column by column so that they are counted
# as missing in the summary. na_if() is not called on the whole data frame
# because dplyr >= 1.1.0 rejects that usage; replace() works for both factor
# and character columns.
df_demographics_binary %>%
  filter(site == "Central") %>%
  mutate(across(everything(), ~ replace(.x, .x %in% "", NA))) %>%
  select(!c(group, site)) %>%
  summary() %>%
  kable()
```


### Eden

`r if (isTRUE(CLOSED)) paste0("**the ", Arm1, " arm**") else ""`
```{r binary demographics table group1 e, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Binary demographics of the first arm at the Eden site.
# Blank entries ('') are recoded to NA column by column so that they are counted
# as missing in the summary. na_if() is not called on the whole data frame
# because dplyr >= 1.1.0 rejects that usage; replace() works for both factor
# and character columns.
df_demographics_binary %>%
  filter(group == Arm1, site == "Eden") %>%
  mutate(across(everything(), ~ replace(.x, .x %in% "", NA))) %>%
  select(!c(group, site)) %>%
  summary() %>%
  kable()
```

`r if (isTRUE(CLOSED)) paste0("**the ", Arm2, " arm**") else ""`
```{r binary demographics table group2 e, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Binary demographics of the second arm at the Eden site.
# Blank entries ('') are recoded to NA column by column so that they are counted
# as missing in the summary. na_if() is not called on the whole data frame
# because dplyr >= 1.1.0 rejects that usage; replace() works for both factor
# and character columns.
df_demographics_binary %>%
  filter(group == Arm2, site == "Eden") %>%
  mutate(across(everything(), ~ replace(.x, .x %in% "", NA))) %>%
  select(!c(group, site)) %>%
  summary() %>%
  kable()
```

```{r binary demographics table e, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Binary demographics at the Eden site, both arms pooled (open report).
# Blank entries ('') are recoded to NA column by column so that they are counted
# as missing in the summary. na_if() is not called on the whole data frame
# because dplyr >= 1.1.0 rejects that usage; replace() works for both factor
# and character columns.
df_demographics_binary %>%
  filter(site == "Eden") %>%
  mutate(across(everything(), ~ replace(.x, .x %in% "", NA))) %>%
  select(!c(group, site)) %>%
  summary() %>%
  kable()
```


### UCT

`r if (isTRUE(CLOSED)) paste0("**the ", Arm1, " arm**") else ""`
```{r binary demographics table group1 u, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Binary demographics of the first arm at the UCT site.
# Blank entries ('') are recoded to NA column by column so that they are counted
# as missing in the summary. na_if() is not called on the whole data frame
# because dplyr >= 1.1.0 rejects that usage; replace() works for both factor
# and character columns.
df_demographics_binary %>%
  filter(group == Arm1, site == "UCT") %>%
  mutate(across(everything(), ~ replace(.x, .x %in% "", NA))) %>%
  select(!c(group, site)) %>%
  summary() %>%
  kable()
```

`r if (isTRUE(CLOSED)) paste0("**the ", Arm2, " arm**") else ""`
```{r binary demographics table group2 u, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Binary demographics of the second arm at the UCT site.
# Blank entries ('') are recoded to NA column by column so that they are counted
# as missing in the summary. na_if() is not called on the whole data frame
# because dplyr >= 1.1.0 rejects that usage; replace() works for both factor
# and character columns.
df_demographics_binary %>%
  filter(group == Arm2, site == "UCT") %>%
  mutate(across(everything(), ~ replace(.x, .x %in% "", NA))) %>%
  select(!c(group, site)) %>%
  summary() %>%
  kable()
```

```{r binary demographics table u, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Binary demographics at the UCT site, both arms pooled (open report).
# Blank entries ('') are recoded to NA column by column so that they are counted
# as missing in the summary. na_if() is not called on the whole data frame
# because dplyr >= 1.1.0 rejects that usage; replace() works for both factor
# and character columns.
df_demographics_binary %>%
  filter(site == "UCT") %>%
  mutate(across(everything(), ~ replace(.x, .x %in% "", NA))) %>%
  select(!c(group, site)) %>%
  summary() %>%
  kable()
```


\newpage

## Categorical demographics

### Central

`r if (CLOSED == TRUE) paste0("**the ", Arm1, " arm**") else ""`
```{r categorical demographics table group1 c, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Categorical demographics of the first arm at the Central site; the free-text
# "other" comment columns and the grouping columns are left out of the summary.
df_demographics_categorical %>%
  filter(group == Arm1, site == "Central") %>%
  select(-c(ethnicity.other_comment, country_birth.other_comment, group, site)) %>%
  summary() %>%
  kable()
```

`r if (CLOSED == TRUE) paste0("**the ", Arm2, " arm**") else ""`
```{r categorical demographics table group2 c, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Categorical demographics of the second arm at the Central site; the free-text
# "other" comment columns and the grouping columns are left out of the summary.
df_demographics_categorical %>%
  filter(group == Arm2, site == "Central") %>%
  select(-c(ethnicity.other_comment, country_birth.other_comment, group, site)) %>%
  summary() %>%
  kable()
```

```{r categorical demographics table c, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Categorical demographics at the Central site, both arms pooled (open report);
# the free-text "other" comment columns and the grouping columns are left out.
df_demographics_categorical %>%
  filter(site == "Central") %>%
  select(-c(ethnicity.other_comment, country_birth.other_comment, group, site)) %>%
  summary() %>%
  kable()
```

### Eden

`r if (CLOSED == TRUE) paste0("**the ", Arm1, " arm**") else ""`
```{r categorical demographics table group1 e, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Categorical demographics of the first arm at the Eden site.
# Where ethnicity or country of birth is recorded as "Other", the free-text
# comment is shown instead; the comment columns are then dropped.
df_demographics_categorical %>%
  filter(group == Arm1, site == "Eden") %>%
  mutate(
    ethnicity = factor(ifelse(ethnicity != "Other", as.character(ethnicity), as.character(ethnicity.other_comment))),
    country_birth = factor(ifelse(country_birth != "Other", as.character(country_birth), as.character(country_birth.other_comment)))
  ) %>%
  select(-c(ethnicity.other_comment, country_birth.other_comment, group, site)) %>%
  summary() %>%
  kable()
```

`r if (CLOSED == TRUE) paste0("**the ", Arm2, " arm**") else ""`
```{r categorical demographics table group2 e, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Categorical demographics of the second arm at the Eden site.
# Where ethnicity or country of birth is recorded as "Other", the free-text
# comment is shown instead; the comment columns are then dropped.
df_demographics_categorical %>%
  filter(group == Arm2, site == "Eden") %>%
  mutate(
    ethnicity = factor(ifelse(ethnicity != "Other", as.character(ethnicity), as.character(ethnicity.other_comment))),
    country_birth = factor(ifelse(country_birth != "Other", as.character(country_birth), as.character(country_birth.other_comment)))
  ) %>%
  select(-c(ethnicity.other_comment, country_birth.other_comment, group, site)) %>%
  summary() %>%
  kable()
```

```{r categorical demographics table e, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Categorical demographics at the Eden site, both arms pooled (open report);
# the free-text "other" comment columns and the grouping columns are left out.
# NOTE(review): the per-arm Eden chunks replace "Other" by the free-text
# comment before summarising, this pooled chunk does not; confirm whether the
# difference is intended.
df_demographics_categorical %>%
  filter(site == "Eden") %>%
  select(-c(ethnicity.other_comment, country_birth.other_comment, group, site)) %>%
  summary() %>%
  kable()
```

### UCT

`r if (CLOSED == TRUE) paste0("**the ", Arm1, " arm**") else ""`
```{r categorical demographics table group1 u, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Categorical demographics of the first arm at the UCT site; the free-text
# "other" comment columns and the grouping columns are left out of the summary.
df_demographics_categorical %>%
  filter(group == Arm1, site == "UCT") %>%
  select(-c(ethnicity.other_comment, country_birth.other_comment, group, site)) %>%
  summary() %>%
  kable()
```

`r if (CLOSED == TRUE) paste0("**the ", Arm2, " arm**") else ""`
```{r categorical demographics table group2 u, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Categorical demographics of the second arm at the UCT site; the free-text
# "other" comment columns and the grouping columns are left out of the summary.
df_demographics_categorical %>%
  filter(group == Arm2, site == "UCT") %>%
  select(-c(ethnicity.other_comment, country_birth.other_comment, group, site)) %>%
  summary() %>%
  kable()
```

```{r categorical demographics table u, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Categorical demographics at the UCT site, both arms pooled (open report);
# the free-text "other" comment columns and the grouping columns are left out.
df_demographics_categorical %>%
  filter(site == "UCT") %>%
  select(-c(ethnicity.other_comment, country_birth.other_comment, group, site)) %>%
  summary() %>%
  kable()
```

\newpage

## Work related demographics, including expected exposure to COVID-19

### Central

`r if (CLOSED == TRUE) paste0("**the ", Arm1, " arm**") else ""`
```{r work demographics table group1 c, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Work-related characteristics of the first arm at the Central site.
# The LaTeX table is scaled down and held at its position in the text.
df_work_categorical %>%
  filter(group == Arm1, site == "Central") %>%
  select(-c(group, site)) %>%
  summary() %>%
  kable() %>%
  kable_styling(latex_options = c("scale_down", "HOLD_position"))
```

`r if (CLOSED == TRUE) paste0("**the ", Arm2, " arm**") else ""`
```{r work demographics table group2 c, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Work-related characteristics of the second arm at the Central site.
# The LaTeX table is scaled down and held at its position in the text.
df_work_categorical %>%
  filter(group == Arm2, site == "Central") %>%
  select(-c(group, site)) %>%
  summary() %>%
  kable() %>%
  kable_styling(latex_options = c("scale_down", "HOLD_position"))
```

```{r work demographics table c, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Work-related characteristics at the Central site, both arms pooled (open report).
# The LaTeX table is scaled down and held at its position in the text.
df_work_categorical %>%
  filter(site == "Central") %>%
  select(-c(group, site)) %>%
  summary() %>%
  kable() %>%
  kable_styling(latex_options = c("scale_down", "HOLD_position"))
```

*Legend work_hours: 1_40 means 1-40 hours/week, 41_80 means 41-80 hours/week, more_than_80 means more than 80 hours/week.*

\newpage

### Eden

`r if (CLOSED == TRUE) paste0("**the ", Arm1, " arm**") else ""`
```{r work demographics table group1 e, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Work-related characteristics of the first arm at the Eden site.
# The LaTeX table is scaled down and held at its position in the text.
df_work_categorical %>%
  filter(group == Arm1, site == "Eden") %>%
  select(-c(group, site)) %>%
  summary() %>%
  kable() %>%
  kable_styling(latex_options = c("scale_down", "HOLD_position"))
```

`r if (CLOSED == TRUE) paste0("**the ", Arm2, " arm**") else ""`
```{r work demographics table group2 e, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Work-related characteristics of the second arm at the Eden site.
# The LaTeX table is scaled down and held at its position in the text.
df_work_categorical %>%
  filter(group == Arm2, site == "Eden") %>%
  select(-c(group, site)) %>%
  summary() %>%
  kable() %>%
  kable_styling(latex_options = c("scale_down", "HOLD_position"))
```

```{r work demographics table e, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Work-related characteristics at the Eden site, both arms pooled (open report).
# The LaTeX table is scaled down and held at its position in the text.
df_work_categorical %>%
  filter(site == "Eden") %>%
  select(-c(group, site)) %>%
  summary() %>%
  kable() %>%
  kable_styling(latex_options = c("scale_down", "HOLD_position"))
```

*Legend work_hours: 1_40 means 1-40 hours/week, 41_80 means 41-80 hours/week, more_than_80 means more than 80 hours/week.*

\newpage

### UCT

`r if (CLOSED == TRUE) paste0("**the ", Arm1, " arm**") else ""`
```{r work demographics table group1 u, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Work-related characteristics of the first arm at the UCT site.
# The LaTeX table is scaled down and held at its position in the text.
df_work_categorical %>%
  filter(group == Arm1, site == "UCT") %>%
  select(-c(group, site)) %>%
  summary() %>%
  kable() %>%
  kable_styling(latex_options = c("scale_down", "HOLD_position"))
```

`r if (CLOSED == TRUE) paste0("**the ", Arm2, " arm**") else ""`
```{r work demographics table group2 u, eval = eval_dsmb & CLOSED, echo = echo_dsmb}
# Work-related characteristics of the second arm at the UCT site.
# The LaTeX table is scaled down and held at its position in the text.
df_work_categorical %>%
  filter(group == Arm2, site == "UCT") %>%
  select(-c(group, site)) %>%
  summary() %>%
  kable() %>%
  kable_styling(latex_options = c("scale_down", "HOLD_position"))
```

```{r work demographics table u, eval = eval_dsmb & !CLOSED, echo = echo_dsmb}
# Work-related characteristics at the UCT site, both arms pooled (open report).
# The LaTeX table is scaled down and held at its position in the text.
df_work_categorical %>%
  filter(site == "UCT") %>%
  select(-c(group, site)) %>%
  summary() %>%
  kable() %>%
  kable_styling(latex_options = c("scale_down", "HOLD_position"))
```

*Legend work_hours: 1_40 means 1-40 hours/week, 41_80 means 41-80 hours/week, more_than_80 means more than 80 hours/week.*

\newpage 

## Additional figures per site

### Demographics as bar graphs (number of participants)

The following figures display the baseline demographics of the participants`r ifelse(CLOSED == T, " stratified by study arm", "")`.

### Age (bar graph)
```{r appJ demographics age bar, eval = eval_dsmb, echo = echo_dsmb, fig.height = 3}
# Bar chart of the number of participants per age value, with one panel per
# site (rows) and per value of `group` (columns).
# guide = "none" hides the fill legend; the logical form (guide = FALSE) is
# deprecated since ggplot2 3.3.4.
# NOTE(review): panels are split by `group` regardless of CLOSED; confirm that
# `group` is pooled upstream for the open report.
# Alternative facetting with explicit panel labels, kept for reference:
# facet_grid(site~group, labeller = labeller(group = c('Both'='Total trial population', '1' = '1', '2' = '2')))
ggplot(df_demographics_numerical, aes(age, fill = factor(group))) +
  geom_bar() +
  scale_x_continuous(name = "Age (year)") +
  scale_y_continuous(name = "Number of participants") +
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = "Arm", guide = "none", option = "cividis") +
  ggtitle(paste0("Distribution of age in the trial population", if (isTRUE(CLOSED)) " stratified per arm" else "")) +
  theme_bw() +
  facet_grid(site ~ group)

```

### Baseline body weight (bar graph)
```{r appJ demographics weight bar, eval = eval_dsmb, echo = echo_dsmb, fig.height = 3}
# Bar chart of the number of participants per baseline body weight, rounded to
# whole units, with one panel per site (rows) and per value of `group` (columns).
# guide = "none" hides the fill legend; the logical form (guide = FALSE) is
# deprecated since ggplot2 3.3.4.
# NOTE(review): panels are split by `group` regardless of CLOSED; confirm that
# `group` is pooled upstream for the open report.
# Alternative facetting with explicit panel labels, kept for reference:
# facet_grid(site~group, labeller = labeller(group = c('Both'='Total trial population', '1' = '1', '2' = '2')))
ggplot(df_demographics_numerical, aes(round(weight), fill = factor(group))) +
  geom_bar() +
  scale_x_continuous(name = "Baseline body weight (kg)") +
  scale_y_continuous(name = "Number of participants") +
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = "Arm", guide = "none", option = "cividis") +
  ggtitle(paste0("Distribution of baseline body weight in the trial population", if (isTRUE(CLOSED)) " stratified per arm" else "")) +
  theme_bw() +
  facet_grid(site ~ group)

```

### Height (bar graph)
```{r appJ demographics height bar, eval = eval_dsmb, echo = echo_dsmb, fig.height = 3}
# Bar chart of the number of participants per height value, with one panel per
# site (rows) and per value of `group` (columns). Heights of 75 or below and
# the value 999 are excluded.
# filter() is used instead of base `[` subsetting: a logical row index that
# contains NA (height is NA once 999 has been blanked) makes `[` return all-NA
# rows, which would show up as an extra NA panel. filter() drops those rows.
# guide = "none" hides the fill legend; the logical form (guide = FALSE) is
# deprecated since ggplot2 3.3.4.
# NOTE(review): panels are split by `group` regardless of CLOSED; confirm that
# `group` is pooled upstream for the open report.
# Alternative facetting with explicit panel labels, kept for reference:
# facet_grid(site~group, labeller = labeller(group = c('Both'='Total trial population', '1' = '1', '2' = '2')))
df_demographics_numerical %>%
  filter(height > 75, height != 999) %>%
  ggplot(aes(height, fill = factor(group))) +
  geom_bar() +
  scale_x_continuous(name = "Height (cm)") +
  scale_y_continuous(name = "Number of participants") +
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = "Arm", guide = "none", option = "cividis") +
  ggtitle(paste0("Distribution of height in the trial population", if (isTRUE(CLOSED)) " stratified per arm" else "")) +
  theme_bw() +
  facet_grid(site ~ group)

```

### BMI (bar graph)
```{r appJ demographics BMI bar, eval = eval_dsmb, echo = echo_dsmb, fig.height = 3}
# Bar chart of the number of participants per BMI value, with one panel per
# site (rows) and per value of `group` (columns). Rows whose height or weight
# equals 999 (or is NA) are excluded. Dotted vertical lines are drawn at BMI
# 18.5, 25, 30 and 40.
# guide = "none" hides the fill legend; the logical form (guide = FALSE) is
# deprecated since ggplot2 3.3.4.
# NOTE(review): panels are split by `group` regardless of CLOSED; confirm that
# `group` is pooled upstream for the open report.
# Check of the BMI calculation, kept for reference:
# geom_density(aes(weight/(height/100 * height/100)), col = 'red')
# Alternative facetting with explicit panel labels, kept for reference:
# facet_grid(site~group, labeller = labeller(group = c('Both'='Total trial population', '1' = '1', '2' = '2')))
df_demographics_numerical %>%
  filter(height != 999, weight != 999) %>%
  ggplot(aes(BMI, fill = factor(group))) +
  geom_bar() +
  geom_vline(xintercept = c(18.5, 25, 30, 40), linetype = "dotted") +
  scale_x_continuous(name = expression(paste("BMI (kg/", m^2, ")", sep = ""))) +
  scale_y_continuous(name = "Number of participants") +
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = "Arm", guide = "none", option = "cividis") +
  ggtitle(paste0("Distribution of BMI in the trial population", if (isTRUE(CLOSED)) " stratified per arm" else "")) +
  theme_bw() +
  facet_grid(site ~ group)

```

*Dotted lines delimit categories underweight (<18.5), normal (18.5-24.9), overweight (25-29.9), obese (30-39.9), extremely obese ($\geq$40).*

### Current smoking (bar graph)
```{r appJ demographics smoking bar, eval = eval_dsmb, echo = echo_dsmb, fig.height = 3}
# Bar chart of the number of participants per pack-years value from df_risk,
# with one panel per site (rows) and per value of `group` (columns).
# guide = "none" hides the fill legend; the logical form (guide = FALSE) is
# deprecated since ggplot2 3.3.4.
# NOTE(review): panels are split by `group` regardless of CLOSED; confirm that
# `group` is pooled upstream for the open report.
# Alternative facetting with explicit panel labels, kept for reference:
# facet_grid(site~group, labeller = labeller(group = c('Both'='Total trial population', '1' = '1', '2' = '2')))
ggplot(df_risk, aes(pack_years, fill = factor(group))) +
  geom_bar() +
  scale_x_continuous(name = "Pack years") +
  scale_y_continuous(name = "Number of participants") +
  scale_fill_viridis(discrete = TRUE, end = 0.75, name = "Arm", guide = "none", option = "cividis") +
  ggtitle(paste0("Distribution of current smoking in the trial population", if (isTRUE(CLOSED)) " stratified per arm" else "")) +
  theme_bw() +
  facet_grid(site ~ group)

```

\vspace*{\fill} <!-- getting compilation time at the bottom of the page -->

This report took `r (proc.time() - ptm)[3]` seconds to compile.