-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_datasets.R
82 lines (64 loc) · 2.59 KB
/
create_datasets.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
##################################################################
#
# This script creates income and shapefiles datasets and saves them
# as R objects. This allows for quicker import
#
##################################################################
library(tidyverse)
library(DBI)
library(data.table)
library(tigris)
#library(gridExtra)
#library(grid)
source('palma_functions.R')
# connect to PUMS database
con <- dbConnect(RSQLite::SQLite(), "pums_db.db")
### create dataset of PUMA level incomes
### used for plotting income distributions
# import household tax liabilities for the given year
taxes <- readRDS('nc_tax_liability.Rda')
# initialize dataframe
incomes <- data.frame()
for (yr in seq(2015, 2016, 2017)) {
incomes <- household_incomes_dist(con, yr, 37, taxes) %>%
bind_rows(incomes, .)
}
# don't need serial number or state, since state is NC for all
incomes <- incomes %>%
filter(pre_tax_income >= 500) %>%
select(-SERIALNO, -ST, -pre_tax_income)
# adjust incomes for inflation
incomes_post <- inflation_adjust(incomes, wages_col = post_tax_income, year_adjust = 2017, error = FALSE) %>%
select(post_tax_income)%>%
.[[1]]
# add inflation adjusted columns to dataset
incomes$post_tax_income <- incomes_post
# replace PUMA number with name
# import list of PUMA numbers and names, and merge with income dataset
puma_names <- read_csv('puma_counties.csv') %>%
select(puma12, PUMA12name) %>%
distinct()
# vector of counties that have PUMAs all within the county
# allows us to filter for PUMA by county
counties <- c('Alamance', 'Brunswick', 'Buncombe', 'Cabarrus', 'Catawba', 'Craven', 'Cumberland', 'Davidson',
'Durham', 'Forsyth', 'Gaston', 'Guilford', 'Harnett', 'Iredell', 'Johnston', 'Mecklenburg',
'Orange', 'Pitt', 'Randolph', 'Rowan', 'Wake', 'Wayne')
incomes <- left_join(incomes, puma_names, by = c('PUMA' = 'puma12')) %>%
# remove PUMA numbers
select(-PUMA) %>%
# rename puma description column
rename(PUMA = PUMA12name) %>%
# only keep counties that have PUMAs filly within counties
filter(group %in% !!counties)
# saveRDS(incomes, file="incomes.Rda")
# read in full palma data, but only keep needed counties
palma_full <- read_csv('palmas_full.csv') %>%
mutate_at(vars(estimate, se, moe, cv), funs(round(., 2))) %>%
filter(!(type == 'county' & !(subtype %in% counties))) %>%
select(-se, -cv)
# saveRDS(palma_full, 'palma_full.Rda')
# import shapefiles and create datasets
nc_puma <- pumas(state = 'NC', cb = TRUE)
# saveRDS(nc_puma, 'nc_puma.Rda')
nc_county <- counties(state = 'NC', cb = TRUE)
# saveRDS(nc_county, 'nc_county.Rda')