-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_BRFSS_dataset.Rmd
157 lines (139 loc) · 5.55 KB
/
get_BRFSS_dataset.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
---
title: "R Notebook"
output: html_notebook
---
# Generate BRFSS Dataset
Read in entire BRFSS dataset, subsample, and process select columns
Details about BRFSS dataset: https://www.cdc.gov/brfss/annual_data/annual_2016.html
The original XPT dataset must be downloaded from here:
https://www.cdc.gov/brfss/annual_data/2016/files/LLCP2016XPT.zip
Note that this is an optional step. The processed data is in `dat/BRFSS.48K.csv`, or you can obtain the processed dataset with:
```
# URL of the already-processed sample (CSV) hosted on GitHub.
data.fn <- "https://raw.githubusercontent.com/kijohnson/Advanced-Data-Analysis/master/Class%204%20Data%20Visualization/BRFSS.48K.csv"
# read.csv() accepts a URL, so no separate download step is needed.
BRFSS <- read.csv(file = data.fn)
```
### Load the foreign library to access the read.xport function
```{r}
library(foreign)
```
### Read in the BRFSS SAS dataset
```{r}
# Relative path to the unzipped SAS transport (XPT) file.
# This may need to be adjusted.
BRFSS.fn <- "../dat/LLCP2016.XPT"

# Parse the transport file; read.xport() is provided by the foreign package.
BRFSS2016 <- read.xport(file = BRFSS.fn)
```
### Take a 10% random sample of the BRFSS2016 dataset and list all the variables
```{r}
# Draw a simple random sample (without replacement) of 10% of the rows.
# The sample size is derived from the data (integer division by 10) instead of
# being hard-coded, so the chunk no longer fails on a file with fewer rows
# than the fixed size; for a file with 486,300-486,309 rows this is still the
# 48,630 rows that were requested originally.
# NOTE: no seed is set in this chunk, so each run draws a different sample.
sample_size <- nrow(BRFSS2016) %/% 10
sampled_rows <- sample.int(nrow(BRFSS2016), size = sample_size, replace = FALSE)
BRFSS2016_10Percent <- BRFSS2016[sampled_rows, ]

# List every variable (column name) available in the sample.
varlist <- colnames(BRFSS2016_10Percent)
varlist
```
### Select variables of interest to make the size manageable for the class demonstration
```{r}
# Raw BRFSS columns kept for the class demonstration.
myvars <- c(
  "X_STATE", "EMPLOY1", "INCOME2", "SEATBELT", "DIABETE3",
  "X_BMI5", "HTM4", "WTKG3", "SEX", "X_AGE80"
)
# Restrict the 10% sample to those columns only.
BRFSS_samp <- BRFSS2016_10Percent[, myvars]
```
### Look at the State variable
```{r}
# Storage class of the raw state code column.
class(BRFSS_samp[["X_STATE"]])
# Number of sampled respondents for each state code.
table(BRFSS_samp[["X_STATE"]])
```
### Assign labels to values of state (code book can be accessed here: https://www.cdc.gov/brfss/annual_data/2016/pdf/codebook16_llcp.pdf)
```{r}
# Numeric X_STATE codes, each named by the label it receives.
state_codes <- c(
  AL = 1, AK = 2, AZ = 4, AR = 5, CA = 6, CO = 8, CT = 9, DE = 10, DC = 11,
  FL = 12, GA = 13, HI = 15, ID = 16, IL = 17, IN = 18, IA = 19, KS = 20,
  KY = 21, LA = 22, ME = 23, MD = 24, MA = 25, MI = 26, MN = 27, MS = 28,
  MO = 29, MT = 30, NE = 31, NV = 32, NH = 33, NJ = 34, NM = 35, NY = 36,
  NC = 37, ND = 38, OH = 39, OK = 40, OR = 41, PA = 42, RI = 44, SC = 45,
  SD = 46, TN = 47, TX = 48, UT = 49, VT = 50, VA = 51, WA = 53, WV = 54,
  WI = 55, WY = 56, GUAM = 66, PR = 72, VI = 78
)

# Convert the numeric code into a labelled factor; codes that are not listed
# above become NA.
BRFSS_samp$state <- factor(
  BRFSS_samp$X_STATE,
  levels = unname(state_codes),
  labels = names(state_codes)
)

# Respondents per labelled state in the sample.
table(BRFSS_samp$state)
```
### Quality control check: make sure the number of subjects per state corresponds with what is reported for the X_STATE variable in the code book (https://www.cdc.gov/brfss/annual_data/2016/pdf/codebook16_llcp.pdf)
```{r}
# Label the state codes on the full (unsampled) data set so the per-state
# counts can be compared with the published figures.
# The codes run from 1 to 56 with 3, 7, 14, 43 and 52 unused, followed by
# 66, 72 and 78.
full_state_codes <- c(1:2, 4:6, 8:13, 15:42, 44:51, 53:56, 66, 72, 78)
full_state_labels <- c(
  "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC",
  "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY",
  "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT",
  "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH",
  "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT",
  "VT", "VA", "WA", "WV", "WI", "WY", "GUAM", "PR", "VI"
)

BRFSS2016$state <- factor(
  BRFSS2016$X_STATE,
  levels = full_state_codes,
  labels = full_state_labels
)

# Numbers cross check against the code book.
table(BRFSS2016$state)
```
### Label other variables that we may use: EMPLOY1, INCOME2, SEATBELT, X_BMI5, DIABETE3 (code book can be accessed here: https://www.cdc.gov/brfss/annual_data/2016/pdf/codebook16_llcp.pdf)
```{r}
# Text labels for the EMPLOY1 codes 1-9 and 99, in code order.
employment_labels <- c(
  "Employed for wages",
  "Self-employed",
  "Out of work for 1 year or more",
  "Out of work for less than 1 year",
  "A homemaker",
  "A student",
  "Retired",
  "Unable to work",
  "Refused",
  "Not asked or Missing"
)

# Employment status as a labelled factor; unlisted codes become NA.
BRFSS_samp$employed <- factor(
  BRFSS_samp$EMPLOY1,
  levels = c(1:9, 99),
  labels = employment_labels
)

# Respondents per employment category.
table(BRFSS_samp$employed)
# Text labels for the INCOME2 codes, in the same order as income_codes below.
income_labels <- c(
  "< $10,000",
  "$10,000 to less than $15,000",
  "$15,000 to less than $20,000",
  "$20,000 to less than $25,000",
  "$25,000 to less than $35,000",
  "$35,000 to less than $50,000",
  "$50,000 to less than $75,000",
  "$75,000 or more",
  "Don't know/Not sure",
  "Refused",
  "Not asked or missing"
)

# Mixing numbers with "BLANK" makes this a character vector; factor() matches
# the column against it after converting the values to character.
# NOTE(review): the "BLANK" level only matches a literal "BLANK" value. If
# blank responses arrive as NA they stay NA and never receive the
# "Not asked or missing" label - confirm against the imported data.
income_codes <- c(1:8, 77, 99, "BLANK")

# Household income bracket as a labelled factor.
BRFSS_samp$income <- factor(
  BRFSS_samp$INCOME2,
  levels = income_codes,
  labels = income_labels
)

# Respondents per income bracket.
table(BRFSS_samp$income)
# Seat belt use as a labelled factor. SEATBELT codes 1-5 are frequency
# answers; 7, 8 and 9 are the non-substantive answers. Unlisted codes become NA.
# Fix: the label for code 8 previously read "... ride in a care" (typo).
BRFSS_samp$seatbelt <- factor(
  BRFSS_samp$SEATBELT,
  levels = c(1, 2, 3, 4, 5, 7, 8, 9),
  labels = c(
    "Always",
    "Nearly always",
    "Sometimes",
    "Seldom",
    "Never",
    "Don't know/Not sure",
    "Never drive or ride in a car",
    "Refused"
  )
)

# Respondents per seat belt category.
table(BRFSS_samp$seatbelt)

# Check the storage class of the raw BMI column before it is rescaled later.
class(BRFSS_samp$X_BMI5)
# DIABETE3 codes shared by both the long and the short labelling below.
diabetes_codes <- c(1, 2, 3, 4, 7, 9)

# Diabetes status with the full-length answer text.
diabetes_long <- c(
  "Yes",
  "Yes, but female told only during pregnancy",
  "No",
  "No, pre-diabetes or borderline diabetes",
  "Don't know/Not Sure",
  "Refused"
)
BRFSS_samp$diabetes <- factor(
  BRFSS_samp$DIABETE3,
  levels = diabetes_codes,
  labels = diabetes_long
)

# Same variable with abbreviated labels.
diabetes_short <- c(
  "Yes",
  "Yes pregnancy",
  "No",
  "pre/borderline",
  "Unknown",
  "Refused"
)
BRFSS_samp$diabetes_short_label <- factor(
  BRFSS_samp$DIABETE3,
  levels = diabetes_codes,
  labels = diabetes_short
)
#table(BRFSS_samp$diabetes)
# Respondent sex as a labelled factor (1 = Male, 2 = Female, 9 = Refused);
# unlisted codes become NA.
BRFSS_samp$sex <- factor(
  BRFSS_samp$SEX,
  levels = c(1, 2, 9),
  labels = c("Male", "Female", "Refused")
)

# Tabulate the variable that was just created. The original re-tabulated
# `employed` here, which was already shown earlier in this chunk.
table(BRFSS_samp$sex)
# Derive height (m) and weight (kg) by dividing the raw columns by 100.
BRFSS_samp[["height"]] <- BRFSS_samp[["HTM4"]] / 100
BRFSS_samp[["weight"]] <- BRFSS_samp[["WTKG3"]] / 100
# BMI is rescaled by the same factor of 100.
BRFSS_samp[["bmi"]] <- BRFSS_samp[["X_BMI5"]] / 100
# Age is copied unchanged; this is really just a rename.
BRFSS_samp[["age"]] <- BRFSS_samp[["X_AGE80"]]
```
Delete columns which have been converted
Write csv file for Matt for class demo
```{r}
# Raw columns that now have a converted counterpart and can be removed.
drops <- c(
  "X_STATE", "EMPLOY1", "INCOME2", "SEATBELT", "DIABETE3",
  "X_BMI5", "HTM4", "WTKG3", "SEX", "X_AGE80"
)

# Flag the raw columns, then keep everything that is not flagged.
is_raw_column <- names(BRFSS_samp) %in% drops
BRFSS_samp <- BRFSS_samp[, !is_raw_column]
```
```{r}
# NOTE(review): the input is read from "../dat/" but the outputs go to "dat/"
# relative to the working directory - confirm the intended location.

# Comma-separated copy, with quoting enabled.
write.csv(
  x = BRFSS_samp,
  file = "dat/BRFSS.48K.csv",
  quote = TRUE,
  row.names = FALSE
)

# Tab-separated copy, without quoting.
write.table(
  x = BRFSS_samp,
  file = "dat/BRFSS.48K.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
```