.Rhistory

correct_bias = function(){
cat(".")
# randomly sample from each distribution
samples = apply(distributions, 2, function(x) sample(x, size=1, replace=TRUE))
# corrected case count
Astar = calc_A_star(N = N,
N_tested = N_tested,
P_S_tested = samples[which(names(samples) == "dist_P_S_tested")],
P_S_untested = samples[which(names(samples) == "dist_P_S_untested")],
P_testpos_S = samples[which(names(samples) == "dist_P_testpos_S")],
P_testpos_A = samples[which(names(samples) == "dist_P_testpos_A")],
Se = samples[which(names(samples) == "dist_Se")],
Sp = samples[which(names(samples) == "dist_Sp")]
)
out = data.frame(
Astar = Astar,
N = N,
N_tested = N_tested,
P_S_tested = samples[which(names(samples) == "dist_P_S_tested")],
P_S_untested = samples[which(names(samples) == "dist_P_S_untested")],
P_testpos_S = samples[which(names(samples) == "dist_P_testpos_S")],
P_testpos_A = samples[which(names(samples) == "dist_P_testpos_A")],
Se = samples[which(names(samples) == "dist_Se")],
Sp = samples[which(names(samples) == "dist_Sp")]
)
return(out)
}
#----------------------------------------
# Obtain corrected case estimates
#----------------------------------------
set.seed(123)
reps = 1000
result = replicate(reps, correct_bias())
result_long = as.data.frame(matrix(result, nrow=reps, byrow=TRUE))
colnames(result_long) = c(
"exp_cases", "N", "N_tested", "P_S_tested", "P_S_untested",
"P_testpos_S", "P_testpos_A", "Se", "Sp"
)
for(i in 1:ncol(result_long)){
result_long[,i] = unlist(result_long[,i])
}
return(result_long)
}
generate_corrected_sample(N = 331002651, N_tested = 103945, distributions = simdata)
# Expected number of cases if the whole population had been tested, corrected
# for imperfect test sensitivity and specificity.
#
# Arguments:
#   N             population size
#   N_tested      number of people tested
#   P_S_tested    P(symptomatic | tested)
#   P_S_untested  P(symptomatic | not tested)
#   P_testpos_S   P(test + | symptomatic)
#   P_testpos_A   P(test + | asymptomatic)
#   Se, Sp        sensitivity and specificity of the test
# Returns the corrected case count. The final division is by (Se + Sp - 1),
# so the result is not finite when Se + Sp equals 1.
calc_A_star <- function(N, N_tested, P_S_tested, P_S_untested, P_testpos_S, P_testpos_A, Se, Sp) {
  # symptomatic head counts in the tested and untested groups
  sympt_in_tested <- P_S_tested * N_tested
  sympt_in_untested <- P_S_untested * (N - N_tested)

  # population-wide shares of symptomatic and asymptomatic people
  share_sympt <- (1/N) * (sympt_in_tested + sympt_in_untested)
  share_asympt <- 1 - share_sympt

  # positives expected in each stratum if everyone were tested
  pos_sympt <- N * P_testpos_S * share_sympt
  pos_asympt <- N * P_testpos_A * share_asympt
  pos_total <- pos_sympt + pos_asympt

  # remove expected false positives, then rescale by (Se + Sp - 1)
  (pos_total - ((1 - Sp) * N)) / (Se + Sp - 1)
}
# Example call with a fixed point value for every input
calc_A_star(N = 331002651,      # Population size
N_tested = 103945,  # Number tested
P_S_tested = 0.8,   # P(S|tested)
P_S_untested = 0.2, # P(S|untested)
P_testpos_S = 0.8,  # P(test+ | S, tested)
P_testpos_A = 0.2,  # P(test+ | A, tested)
Se = 0.8,           # Sensitivity of test
Sp =  0.8           # Specificity of test
)
# Simulated distributions for each bias parameter (100000 draws apiece).
# The generator is seeded once. Calling set.seed(123) again before every
# rbeta() restarts the same random stream for each parameter, so the draws in
# a given row are not independent of one another.
set.seed(123)
# probability of being symptomatic among those tested
dist_P_S_tested <- rbeta(n = 100000, shape1 = 20, shape2 = 1.4)
# probability of being symptomatic among those not tested
dist_P_S_untested <- rbeta(n = 100000, shape1 = 3, shape2 = 11)
# probability of testing positive among the symptomatic
dist_P_testpos_S <- rbeta(n = 100000, shape1 = 40, shape2 = 10)
# probability of testing positive among the asymptomatic
dist_P_testpos_A <- rbeta(n = 100000, shape1 = 5, shape2 = 20)
# distribution of sensitivity of test
dist_Se <- rbeta(n = 100000, shape1 = 20, shape2 = 2.5)
# distribution of specificity of test
dist_Sp <- rbeta(n = 100000, shape1 = 40, shape2 = 4)
# every column is defined above before the data frame is assembled
simdata <- data.frame(
  dist_P_S_tested = dist_P_S_tested,
  dist_P_S_untested = dist_P_S_untested,
  dist_P_testpos_S = dist_P_testpos_S,
  dist_P_testpos_A = dist_P_testpos_A,
  dist_Se = dist_Se,
  dist_Sp = dist_Sp
)
# Estimate P(asymptomatic | test +) as if the whole population were tested.
# The tested and untested groups are each split by symptom status, the
# symptom-specific test positivity is applied to the resulting counts, and
# the asymptomatic share of all expected positives is returned.
est_P_A_testpos <- function(P_S_tested, P_S_untested, P_testpos_S, P_testpos_A,
                            N_tested, N) {
  # symptomatic head counts among the tested and the untested
  sympt_tested <- P_S_tested * N_tested
  sympt_untested <- P_S_untested * (N - N_tested)

  # population totals by symptom status
  n_sympt <- sympt_tested + sympt_untested
  n_asympt <- (N_tested - sympt_tested) + (N - N_tested - sympt_untested)

  # expected positives in each stratum
  pos_asympt <- P_testpos_A * n_asympt
  pos_sympt <- P_testpos_S * n_sympt

  pos_asympt / (pos_asympt + pos_sympt)
}
# Remaining parameter distributions. The draws continue the current random
# stream: calling set.seed(123) again before each rbeta() would restart the
# same stream for every parameter and make the columns depend on one another.
# probability of testing positive among the asymptomatic
dist_P_testpos_A <- rbeta(n = 100000, shape1 = 5, shape2 = 20)
# distribution of sensitivity of test
dist_Se <- rbeta(n = 100000, shape1 = 20, shape2 = 2.5)
# distribution of specificity of test
dist_Sp <- rbeta(n = 100000, shape1 = 40, shape2 = 4)
# Assemble the data frame only after all six columns exist
simdata <- data.frame(
  dist_P_S_tested = dist_P_S_tested,
  dist_P_S_untested = dist_P_S_untested,
  dist_P_testpos_S = dist_P_testpos_S,
  dist_P_testpos_A = dist_P_testpos_A,
  dist_Se = dist_Se,
  dist_Sp = dist_Sp
)
# Estimate P(asymptomatic | test +) as if the whole population were tested.
# The tested and untested groups are each split by symptom status, the
# symptom-specific test positivity is applied to the resulting counts, and
# the asymptomatic share of all expected positives is returned.
est_P_A_testpos <- function(P_S_tested, P_S_untested, P_testpos_S, P_testpos_A,
                            N_tested, N) {
  # symptomatic head counts among the tested and the untested
  sympt_tested <- P_S_tested * N_tested
  sympt_untested <- P_S_untested * (N - N_tested)

  # population totals by symptom status
  n_sympt <- sympt_tested + sympt_untested
  n_asympt <- (N_tested - sympt_tested) + (N - N_tested - sympt_untested)

  # expected positives in each stratum
  pos_asympt <- P_testpos_A * n_asympt
  pos_sympt <- P_testpos_S * n_sympt

  pos_asympt / (pos_asympt + pos_sympt)
}
# P(asymptomatic | test +) for every simulated draw.
# est_P_A_testpos() is plain elementwise arithmetic, so whole columns are
# passed in one call instead of looping over each row; this also avoids the
# 1:nrow() loop, which fails when simdata has no rows.
# The result is kept as a one-column matrix, the shape the loop used to fill.
dist_P_A_testpos <- matrix(
  est_P_A_testpos(
    P_S_tested = simdata$dist_P_S_tested,
    P_S_untested = simdata$dist_P_S_untested,
    P_testpos_S = simdata$dist_P_testpos_S,
    P_testpos_A = simdata$dist_P_testpos_A,
    N_tested = 103945,
    N = 331002651
  ),
  ncol = 1
)
simdata = simdata %>% mutate(dist_P_A_testpos = dist_P_A_testpos)
simdatal = melt(simdata)
simdata = simdata %>% dplyr::select(
dist_P_S_tested, dist_P_S_untested,
dist_P_testpos_S, dist_P_testpos_A,
dist_P_A_testpos, dist_Se, dist_Sp
)
table = data.frame(
label = c(
"Distribution of P(Symptomatic | tested)",
"Distribution of P(Symptomatic | not tested)",
"Distribution of P(Test + | Symptomatic)",
"Distribution of P(Test + | Asymptomatic)",
"Distribution of P(Asymptomatic | Test +)",
"Distribution of test sensitivity",
"Distribution of test specificity"
),
min =    sprintf("%0.02f", apply(simdata, 2, min)),
median = sprintf("%0.02f", apply(simdata, 2, median)),
max =    sprintf("%0.02f", apply(simdata, 2, max))
)
rownames(table) = NULL
kable(table) %>%
kable_styling()
simdatal = simdatal %>% mutate(label = case_when(
variable == "dist_P_S_tested" ~ "P(Symptomatic | tested)",
variable == "dist_P_S_untested" ~  "P(Symptomatic | not tested)",
variable == "dist_P_testpos_S" ~  "P(Test + | Symptomatic)",
variable == "dist_P_testpos_A" ~  "P(Test + | Asymptomatic)",
variable == "dist_P_A_testpos" ~  "P(Asymptomatic | Test +)",
variable == "dist_Se" ~  "Test sensitivity",
variable == "dist_Sp" ~  "Test specificity"
))
ggplot(simdatal, aes(x = value)) +
geom_histogram(color="black", fill="white", bins=80) +
facet_wrap(~label) +
theme_bw() + xlab("") + ylab("")
# Draw a Monte Carlo sample of bias-corrected case counts.
#
# Arguments:
#   N              population size
#   N_tested       number of people tested
#   distributions  data frame (or matrix) holding the columns dist_P_S_tested,
#                  dist_P_S_untested, dist_P_testpos_S, dist_P_testpos_A,
#                  dist_Se and dist_Sp; any other column is sampled but unused
#   reps           number of replicates (default 1000)
#   seed           value handed to set.seed() before sampling (default 123);
#                  NULL leaves the random number generator as it is
# Returns a data frame with one row per replicate and the numeric columns
# exp_cases, N, N_tested, P_S_tested, P_S_untested, P_testpos_S, P_testpos_A,
# Se and Sp.
# Side effects: prints one "." per replicate and, unless seed is NULL, resets
# the global random number generator.
generate_corrected_sample <- function(N, N_tested, distributions,
                                      reps = 1000, seed = 123) {
  needed <- c(
    "dist_P_S_tested", "dist_P_S_untested",
    "dist_P_testpos_S", "dist_P_testpos_A", "dist_Se", "dist_Sp"
  )
  # fail early with a clear message instead of an obscure data.frame() error
  absent <- setdiff(needed, colnames(distributions))
  if (length(absent) > 0) {
    stop("distributions is missing column(s): ",
         paste(absent, collapse = ", "), call. = FALSE)
  }

  out_names <- c(
    "exp_cases", "N", "N_tested", "P_S_tested", "P_S_untested",
    "P_testpos_S", "P_testpos_A", "Se", "Sp"
  )

  # one replicate: draw a value from each distribution, correct the count
  correct_bias <- function() {
    cat(".")
    # randomly sample from each distribution (every column, in column order)
    samples <- apply(distributions, 2, function(x) sample(x, size = 1, replace = TRUE))
    picked <- samples[needed]
    # corrected case count
    Astar <- calc_A_star(
      N = N,
      N_tested = N_tested,
      P_S_tested = picked[["dist_P_S_tested"]],
      P_S_untested = picked[["dist_P_S_untested"]],
      P_testpos_S = picked[["dist_P_testpos_S"]],
      P_testpos_A = picked[["dist_P_testpos_A"]],
      Se = picked[["dist_Se"]],
      Sp = picked[["dist_Sp"]]
    )
    unname(c(Astar, N, N_tested, picked))
  }

  #----------------------------------------
  # Obtain corrected case estimates
  #----------------------------------------
  if (!is.null(seed)) {
    set.seed(seed)
  }
  # vapply() yields a length(out_names) x reps numeric matrix; transpose it so
  # that each replicate becomes a row
  draws <- vapply(
    seq_len(reps),
    function(i) correct_bias(),
    numeric(length(out_names))
  )
  result_long <- as.data.frame(t(draws))
  colnames(result_long) <- out_names
  result_long
}
generate_corrected_sample(N = 331002651, N_tested = 103945, distributions = simdata)
corrected_sample = generate_corrected_sample(N = 331002651, N_tested = 103945, distributions = simdata)
corrected_sample
median(corrected_sample$exp_cases)
library(gsheet)
install.packages("gsheet2text")
expOesoph9_multivar_model <- glm(formula = casestatus ~ as.factor(alcgp) ,
family = binomial(link='logit'), data = expOesoph9)
knitr::opts_chunk$set(echo = TRUE)
library(tidyr)
library(dplyr)
library(foreign)
library(epitools)
library(epiR)
library(ORCI)
library(knitr)
library(ggplot2)
library(stringr)
library(epitools)
library(epiR)
library(lmtest)
expOesoph9 = read.dta("data/expOesoph9.dta")
head(expOesoph9)
# Set alcgp_binary = 0 if consumption < 80 g/day, alcgp_binary = 1 if consumption >= 80 g/day
expOesoph9_dichotomous =  expOesoph9 %>% mutate("alcgp_binary" = ifelse((alcgp == 0 |alcgp == 1), 0, 1))
expOesoph9_dichotomous_model <- glm(formula = casestatus ~ alcgp_binary,
family = binomial(link='logit'), data = expOesoph9_dichotomous)
lrtest(expOesoph9_dichotomous_model)
summary(expOesoph9_dichotomous_model)
or_est = exp(expOesoph9_dichotomous_model$coefficients["alcgp_binary"])
or_ci_bounds = exp(confint(expOesoph9_dichotomous_model, level=.95)["alcgp_binary",])
print(paste0("Odds Ratio Point Estimate: ", round(or_est, 3)))
print(paste0("Odds Ratio 95% CI: [", round(or_ci_bounds[1], 3), ", ", round(or_ci_bounds[2], 3), "]"))
expOesoph9_multivar_model <- glm(formula = casestatus ~ as.factor(alcgp) ,
family = binomial(link='logit'), data = expOesoph9)
expOesoph9_multivar_model
expOesoph9_multivar_model <- glm(formula = casestatus ~ as.factor(alcgp) ,
family = binomial(link='logit'), data = expOesoph9)
expOesoph9_multivar_model <- glm(formula = casestatus ~ as.factor(alcgp) ,
family = binomial(link='logit'), data = expOesoph9)
lrtest(expOesoph9_multivar_model)
expOesoph9_multivar_model$coefficients
multivar_coeff = exp(expOesoph9_multivar_model$coefficients)[-1]
multivar_coeff
multivar_cis = exp(confint(expOesoph9_multivar_model, level=.95))[-1,]
multivar_cis
for (i in 1:3){
ci_bounds = multivar_cis[i, ]
print(paste0("For alcgp level ", i, ":"))
print(paste0("- Odds Ratio (with alcgp = 0 as reference level): ", round(multivar_coeff[i],3)))
print(paste0("- Odds Ratio 95% CI: [", round(ci_bounds[1], 3), ", ", round(ci_bounds[2], 3), "]"))
}
expOesoph9_ordered_model = glm(formula = casestatus ~ alcgp,
family = binomial(link='logit'), data = expOesoph9)
lrtest(expOesoph9_ordered_model)
or_ordered = exp(expOesoph9_ordered_model$coefficients["alcgp"])
or_ci_ordered = exp(confint(expOesoph9_ordered_model, level=.95)["alcgp",])
or_ci_ordered
or_ci_ordered[1]
expOesoph9_multivar_model <- glm(formula = casestatus ~ as.factor(alcgp) ,
family = binomial(link='logit'), data = expOesoph9)
expOesoph9_ordered_model = glm(formula = casestatus ~ alcgp,
family = binomial(link='logit'), data = expOesoph9)
# Likelihood-ratio statistic comparing the categorical alcgp model with the
# linear-trend alcgp model, followed by its chi-square p-value (df = 2).
# The statistic and the p-value are printed as two separate statements;
# joining them with a comma on one line is a syntax error.
gof_test_stat <- 2 * (logLik(expOesoph9_multivar_model) - logLik(expOesoph9_ordered_model))
gof_test_stat
as.numeric(pchisq(gof_test_stat, df = 2, lower.tail = FALSE))
or_ordered = exp(expOesoph9_ordered_model$coefficients["alcgp"])
or_ci_ordered = exp(confint(expOesoph9_ordered_model, level=.95)["alcgp",])
print(paste0("Odds Ratio associated with each one unit increase of alcohol consumption: ",
round(or_ordered, 3)))
print(paste0("95% CI: [", round(or_ci_ordered[1], 3), ", ", round(or_ci_ordered[2], 3), "]"))
print(paste0("Odds Ratio associated with each one unit increase of alcohol consumption: ",
round(or_ordered, 3),
"\n95% CI: [", round(or_ci_ordered[1], 3), ", ", round(or_ci_ordered[2], 3), "]"))
print(paste0("Odds Ratio associated with each one unit increase of alcohol consumption: ",
round(or_ordered, 3),
"\n 95% CI: [", round(or_ci_ordered[1], 3), ", ", round(or_ci_ordered[2], 3), "]"))
print(paste0("Odds Ratio Point Estimate: ", round(or_est, 3),
"(", round(or_ci_bounds[1], 3), ", ", round(or_ci_bounds[2], 3), ")"))
print(paste0("Odds Ratio Point Estimate: ", round(or_est, 3),
" (", round(or_ci_bounds[1], 3), ", ", round(or_ci_bounds[2], 3), ")"))
print(paste0("Odds Ratio: ", round(or_est, 3),
" (", round(or_ci_bounds[1], 3), ", ", round(or_ci_bounds[2], 3), ")"))
for (i in 1:3){
ci_bounds = multivar_cis[i, ]
print(paste0("For alcgp level ", i, ":"))
print(paste0("     Odds Ratio (with reference level alcgp = 0): ", round(multivar_coeff[i],3),
"(", round(ci_bounds[1], 3), ", ", round(ci_bounds[2], 3), ")"))
}
for (i in 1:3){
ci_bounds = multivar_cis[i, ]
print(paste0("For alcgp level ", i, ":"))
print(paste0("  Odds Ratio (with reference level alcgp = 0): ", round(multivar_coeff[i],3),
"(", round(ci_bounds[1], 3), ", ", round(ci_bounds[2], 3), ")"))
}
for (i in 1:3){
ci_bounds = multivar_cis[i, ]
print(paste0("For alcgp level ", i, ":"))
print(paste0("  Odds Ratio (with reference level alcgp = 0): ", round(multivar_coeff[i],3),
" (", round(ci_bounds[1], 3), ", ", round(ci_bounds[2], 3), ")"))
}
print(paste0("Odds Ratio associated with each one unit increase of alcohol consumption: ",
round(or_ordered, 3),
"(", round(or_ci_ordered[1], 3), ", ", round(or_ci_ordered[2], 3), ")"))
print(paste0("Odds Ratio associated with each one unit increase of alcohol consumption: ",
round(or_ordered, 3),
" (", round(or_ci_ordered[1], 3), ", ", round(or_ci_ordered[2], 3), ")"))
knitr::opts_chunk$set(echo = TRUE)
library(tidyr)
library(dplyr)
library(foreign)
library(epitools)
library(epiR)
library(ORCI)
library(knitr)
library(ggplot2)
library(stringr)
library(epitools)
library(epiR)
library(lmtest)
# Load the Stata data file and preview the first rows
expOesoph9 = read.dta("data/expOesoph9.dta")
head(expOesoph9)
# --- Model 1: dichotomous alcohol exposure ---
# Set alcgp_binary = 0 if consumption < 80 g/day, alcgp_binary = 1 if consumption >= 80 g/day
expOesoph9_dichotomous =  expOesoph9 %>% mutate("alcgp_binary" = ifelse((alcgp == 0 |alcgp == 1), 0, 1))
# Logistic regression of case status on the binary exposure
expOesoph9_dichotomous_model <- glm(formula = casestatus ~ alcgp_binary,
family = binomial(link='logit'), data = expOesoph9_dichotomous)
# Likelihood-ratio test for the fitted model; with a single model lrtest()
# presumably compares against the intercept-only model (confirm in lmtest docs)
lrtest(expOesoph9_dichotomous_model)
# Odds ratio = exp(coefficient); 95% CI = exp() of the confint() bounds
or_est = exp(expOesoph9_dichotomous_model$coefficients["alcgp_binary"])
or_ci_bounds = exp(confint(expOesoph9_dichotomous_model, level=.95)["alcgp_binary",])
print(paste0("Odds Ratio: ", round(or_est, 3),
" (", round(or_ci_bounds[1], 3), ", ", round(or_ci_bounds[2], 3), ")"))
# --- Model 2: alcgp as a categorical (factor) exposure ---
expOesoph9_multivar_model <- glm(formula = casestatus ~ as.factor(alcgp) ,
family = binomial(link='logit'), data = expOesoph9)
lrtest(expOesoph9_multivar_model)
# Drop the intercept ([-1]) so only the per-level odds ratios and CIs remain
multivar_coeff = exp(expOesoph9_multivar_model$coefficients)[-1]
multivar_cis = exp(confint(expOesoph9_multivar_model, level=.95))[-1,]
# Report each non-reference level; the loop assumes exactly three such levels
for (i in 1:3){
ci_bounds = multivar_cis[i, ]
print(paste0("For alcgp level ", i, ":"))
print(paste0("  Odds Ratio (with reference level alcgp = 0): ", round(multivar_coeff[i],3),
" (", round(ci_bounds[1], 3), ", ", round(ci_bounds[2], 3), ")"))
}
# --- Model 3: alcgp entered as a single numeric term (one slope per unit) ---
expOesoph9_ordered_model = glm(formula = casestatus ~ alcgp,
family = binomial(link='logit'), data = expOesoph9)
lrtest(expOesoph9_ordered_model)
or_ordered = exp(expOesoph9_ordered_model$coefficients["alcgp"])
or_ci_ordered = exp(confint(expOesoph9_ordered_model, level=.95)["alcgp",])
print(paste0("Odds Ratio associated with each one unit increase of alcohol consumption: ",
round(or_ordered, 3),
" (", round(or_ci_ordered[1], 3), ", ", round(or_ci_ordered[2], 3), ")"))
# --- Goodness of fit: categorical model vs. numeric-trend model ---
# Both models are refit here with the same formulas and data as above
expOesoph9_multivar_model <- glm(formula = casestatus ~ as.factor(alcgp) ,
family = binomial(link='logit'), data = expOesoph9)
expOesoph9_ordered_model = glm(formula = casestatus ~ alcgp,
family = binomial(link='logit'), data = expOesoph9)
# Twice the log-likelihood difference, referred to a chi-square with df = 2
# (df is hard-coded; it matches three factor terms vs. one slope)
gof_test_stat = 2 *  (logLik(expOesoph9_multivar_model) - logLik(expOesoph9_ordered_model))
as.numeric(pchisq(gof_test_stat, df=2, lower.tail=FALSE))
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(tidyverse)
library(purrr)
questionnaire = read.csv("data/questionnaire.csv")
meds = read.csv("data/medications.csv")
labs = read.csv("data/labs.csv")
exams = read.csv("data/examination.csv")
diet = read.csv("data/diet.csv")
demog = read.csv("data/demographic.csv")
data_list = list(questionnaire, meds, labs, exams, diet, demog)
questionnaire_vars = c("SEQN", "FSD032B", "FSD032C", "FSDHH")
meds_vars = c("SEQN")
labs_vars = c("SEQN")
exams_vars = c("SEQN")
diet_vars = c("SEQN", "DR1TSFAT", "DR1TCARB")
demog_vars = c("SEQN", "DMDEDUC2", "RIDAGEYR", "RIAGENDR")
var_list = list(questionnaire_vars, meds_vars, labs_vars, exams_vars, diet_vars, demog_vars)
filtered_data = lapply(1:length(data_list), function(x) data_list[[x]] %>% select(var_list[[x]]) %>% drop_na())
full_data = filtered_data %>% reduce(full_join, by = "SEQN")
head(full_data)
full_data %>% group_by(SEQN) %>% summarize(N=n())
# Inspect the rows belonging to one SEQN value.
# filter() needs the comparison operator `==`; a single `=` is treated as a
# named argument and raises an error. The row count comes from nrow().
questionnaire %>% filter(SEQN == 73562)
nrow(meds %>% filter(SEQN == 73562))
meds %>% filter(SEQN == 73562)
questionnaire = read.csv("data/questionnaire.csv")
labs = read.csv("data/labs.csv")
exams = read.csv("data/examination.csv")
diet = read.csv("data/diet.csv")
demog = read.csv("data/demographic.csv")
data_list = list(questionnaire, labs, exams, diet, demog)
questionnaire_vars = c("SEQN", "FSD032B", "FSD032C", "FSDHH")
labs_vars = c("SEQN")
exams_vars = c("SEQN")
diet_vars = c("SEQN", "DR1TSFAT", "DR1TCARB")
demog_vars = c("SEQN", "DMDEDUC2", "RIDAGEYR", "RIAGENDR")
var_list = list(questionnaire_vars, labs_vars, exams_vars, diet_vars, demog_vars)
filtered_data = lapply(1:length(data_list), function(x) data_list[[x]] %>% select(var_list[[x]]) %>% drop_na())
full_data = filtered_data %>% reduce(full_join, by = "SEQN")
head(full_data)
# Check how many observations there are
length(unique(full_data$SEQN))
full_data %>% group_by(SEQN) %>% summarize(N=n())
full_data %>% group_by(SEQN) %>% summarize(N=n()) %>% arrange(N)
full_data %>% group_by(SEQN) %>% summarize(N=n()) %>% arrange(desc(N))
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(tidyverse)
library(purrr)
questionnaire = read.csv("data/questionnaire.csv")
labs = read.csv("data/labs.csv")
exams = read.csv("data/examination.csv")
diet = read.csv("data/diet.csv")
demog = read.csv("data/demographic.csv")
data_list = list(questionnaire, labs, exams, diet, demog)
questionnaire_vars = c("SEQN", "FSD032B", "FSD032C", "FSDHH")
labs_vars = c("SEQN")
exams_vars = c("SEQN")
diet_vars = c("SEQN", "DR1TSFAT", "DR1TCARB")
demog_vars = c("SEQN", "DMDEDUC2", "RIDAGEYR", "RIAGENDR")
var_list = list(questionnaire_vars, labs_vars, exams_vars, diet_vars, demog_vars)
filtered_data = lapply(1:length(data_list), function(x) data_list[[x]] %>% select(var_list[[x]]) %>% drop_na())
full_data = filtered_data %>% reduce(full_join, by = "SEQN")
head(full_data)
# Check how many observations there are
length(unique(full_data$SEQN))
full_data = filtered_data %>% reduce(full_join, by = "SEQN") %>% drop_na()
head(full_data)
# Check how many observations there are
length(unique(full_data$SEQN))
full_data = full_data %>% rename("food_didnt_last" =  "FSD032B")
full_data
setwd("~/ph241/PH241-Final")
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(tidyverse)
library(purrr)
library(epitools)
library(lmtest)
library(biostat3)
library(broom)
# Read relative to the project directory, like every other data file here,
# rather than from a machine-specific absolute path
questionnaire = read.csv("data/questionnaire.csv")
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(tidyverse)
library(purrr)
library(epitools)
library(lmtest)
library(biostat3)
library(broom)
# Read the component files (paths are relative to the project directory)
questionnaire <- read.csv("data/questionnaire.csv")
labs <- read.csv("data/labs.csv")
exams <- read.csv("data/examination.csv")
diet <- read.csv("data/diet.csv")
demog <- read.csv("data/demographic.csv")
data_list <- list(questionnaire, labs, exams, diet, demog)

# Columns kept from each file, in the same order as data_list. Every vector
# is defined here, so the selection never depends on objects left over from
# an earlier session.
questionnaire_vars <- c("SEQN", "FSD032B", "FSD032C", "FSDHH")
labs_vars <- c("SEQN", "LBXTC", "LBDLDL")
exams_vars <- c("SEQN")
diet_vars <- c("SEQN", "DR1TTFAT", "DR1TSFAT", "DR1TCARB")
demog_vars <- c("SEQN", "DMDEDUC2", "RIDAGEYR", "RIAGENDR")
var_list <- list(questionnaire_vars, labs_vars, exams_vars, diet_vars, demog_vars)

# Pair each data frame with its column list; all_of() states explicitly that
# vars is an external character vector of column names
filtered_data <- Map(
  function(dat, vars) dplyr::select(dat, dplyr::all_of(vars)),
  data_list,
  var_list
)

# Join everything on SEQN, keep the analysis variables under readable names
# (renamed by name rather than by position), and drop incomplete rows
full_data <- filtered_data %>%
  reduce(full_join, by = "SEQN") %>%
  dplyr::select(
    id = SEQN,
    fat_intake = DR1TTFAT,
    tot_chol = LBXTC,
    food_security = FSDHH,
    age = RIDAGEYR,
    gender = RIAGENDR,
    saturated_fat = DR1TSFAT,
    carbohydrate = DR1TCARB
  ) %>%
  drop_na()
head(full_data)
# Check how many observations there are
nrow(full_data)
# Keep rows with tot_chol >= 160, then recode tot_chol as a binary outcome:
# 1 when tot_chol >= 240, 0 otherwise
full_data = full_data %>% filter(tot_chol >= 160) %>%
mutate(tot_chol = ifelse(tot_chol >= 240, 1, 0))
# Shift the food_security and gender codes down by one
# (presumably so the lowest category becomes 0 — confirm against the codebook)
full_data = full_data %>% mutate(food_security = food_security-1)
full_data <- full_data %>% mutate(gender = gender -1)
# Model 1: logistic regression with fat_intake interacted with food security
# (as a factor), age, gender and carbohydrate, plus an age-by-gender interaction
model1 <- glm(tot_chol ~ fat_intake * as.factor(food_security) + fat_intake *age + fat_intake * gender + fat_intake *carbohydrate + age*gender, family = binomial(link = 'logit'), data = full_data)
summary(model1)
# Model 2: the same covariates without any fat_intake interactions; the
# age-by-gender interaction is retained
model2 <- glm(tot_chol ~ fat_intake + age + gender + carbohydrate + as.factor(food_security) + age*gender, family = binomial(link = 'logit'), data = full_data)
summary(model2)