-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathph241_finalproj.RMD
166 lines (129 loc) · 5.05 KB
/
ph241_finalproj.RMD
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
---
title: "Exploring the Relationship between Fat Intake and Cholesterol Levels"
author: "Anna Nguyen, Allyson Ling , Bonnie Xu, Jennifer DeGuzman, Christine Le"
date: "May 10, 2020"
output: pdf_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(tidyverse)
library(purrr)
library(epitools)
library(lmtest)
library(biostat3)
library(broom)
```
# Load data
```{r}
questionnaire = read.csv("data/questionnaire.csv")
labs = read.csv("data/labs.csv")
exams = read.csv("data/examination.csv")
diet = read.csv("data/diet.csv")
demog = read.csv("data/demographic.csv")
data_list = list(questionnaire, labs, exams, diet, demog)
```
#Filter data to only inlude relevant variables, drop missing values
```{r}
questionnaire_vars = c("SEQN", "FSD032B", "FSD032C", "FSDHH")
labs_vars = c("SEQN", "LBXTC", "LBDLDL")
exams_vars = c("SEQN")
diet_vars = c("SEQN", "DR1TTFAT", "DR1TSFAT", "DR1TCARB")
demog_vars = c("SEQN", "DMDEDUC2", "RIDAGEYR", "RIAGENDR")
var_list = list(questionnaire_vars, labs_vars, exams_vars, diet_vars, demog_vars)
filtered_data = lapply(1:length(data_list), function(x) data_list[[x]] %>% dplyr::select(var_list[[x]]))
```
# Merge data frames
```{r}
full_data = filtered_data %>% reduce(full_join, by = "SEQN") %>% dplyr::select("SEQN", "DR1TTFAT", "LBXTC", "FSDHH", "RIDAGEYR", "RIAGENDR", "DR1TSFAT", "DR1TCARB") %>% drop_na()
colnames(full_data) = c("id", "fat_intake", "tot_chol", "food_security", "age", "gender", "saturated_fat", "carbohydrate")
head(full_data)
# Check how many observations there are
nrow(full_data)
```
-----------------------------------
Outcome = cholesterol level
- High cholesterol level: tot_chol >= 240 mg/dL
- Normal cholesterol level: 160 <= tot_chol < 240mg/dL
- Low cholesterol level: tot_chol < 160 mg/dL (values dropped from data)
-----------------------------------
Let $Y$ represent cholesterol level such that
$$Y=\begin{cases} 1, \text{if tot_chol} \geq \text{240 mg/dL} \\
0, \text{if tot_chol < 240 mg/dL} \end{cases}$$
```{r}
full_data = full_data %>% filter(tot_chol >= 160) %>%
mutate(tot_chol = ifelse(tot_chol >= 240, 1, 0))
```
-----------------------------------
Confounding = household food security level
- Full security: 0
- Marginal security: 1
- Low security: 2
- Very low security: 3
-----------------------------------
Let $Z_i$ be an indicator of household food security level i
$$Z_0 = \begin{cases} 1, \text{if Full security} \\
0, \text{else} \end{cases}$$
$$Z_1 = \begin{cases} 1, \text{if Marginal security} \\
0, \text{else} \end{cases}$$
$$Z_2 = \begin{cases} 1, \text{if Low security} \\
0, \text{else} \end{cases}$$
$$Z_3 = \begin{cases} 1, \text{if Very Low security} \\
0, \text{else} \end{cases}$$
Addionally, let $Z$ represent household food security level such that
$$Z=\begin{cases} 0, \text{if Full security} \\
1, \text{if Marginal security} \\
2, \text{if Low security} \\
3, \text{if Very low security} \end{cases}$$
```{r}
full_data = full_data %>% mutate(food_security = food_security-1)
```
-----------------------
Possible third variabe: Age
$$A = \text{age at interview, as a continuous variable} $$
----------------------
----------------------
Possible third variable : Gender
Self-reported Gender at Interview
$$ G = \begin{cases} 1, \text{Female} \\
0, \text{Male} \end{cases}
$$
```{r}
full_data <- full_data %>% mutate(gender = gender -1)
```
#Model 1
```{r}
model1 <- glm(tot_chol ~ fat_intake * as.factor(food_security) + fat_intake *age + fat_intake * gender + fat_intake *carbohydrate + age*gender, family = binomial(link = 'logit'), data = full_data)
summary(model1)
```
#Model 2 model without significant interaction
```{r}
model2 <- glm(tot_chol ~ fat_intake + age + gender + carbohydrate + age*gender + fat_intake*carbohydrate, family = binomial(link = 'logit'), data = full_data)
summary(model2)
lrtest(model1, model2)
```
#Model 3 model without significant confounders
```{r}
model3 <- glm(tot_chol ~ fat_intake + age + gender + age*gender, family = binomial(link = 'logit'), data = full_data)
summary(model3)
lrtest(model2, model3)
```
#Model Visualizations
```{r}
full_data$pred_risk <- predict(model3, type = "response", newdata = full_data)
full_data$Gender = as.factor(full_data$gender)
levels(full_data$Gender) = c("Male","Female")
full_data <- full_data %>%
mutate(pred_odds = pred_risk/(1-pred_risk)) %>%
mutate(pred_log_odds = log(pred_odds))
ggplot(full_data, aes(x = age, y = pred_log_odds, color = Gender)) +
geom_point() +
xlab("Age") +
ylab("Predicted Log Odds of Total Cholesterol") +
ggtitle("Age vs. Total Cholesterol by Gender")
ggplot(full_data, aes(x = fat_intake, y = pred_log_odds, color = Gender)) +
geom_point() +
xlab("Total Fat Intake") +
ylab("Predicted Log Odds of Total Cholesterol") +
ggtitle("Total Fat Intake vs. Total Cholesterol by Gender")
```