diff --git a/.nojekyll b/.nojekyll index da9d188..f7c8040 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -d1a4c137 \ No newline at end of file +334f3c1f \ No newline at end of file diff --git a/final.html b/final.html index adbfc4c..84be0d7 100644 --- a/final.html +++ b/final.html @@ -20,6 +20,40 @@ width: 0.8em; margin: 0 0.8em 0.2em -1em; vertical-align: middle; } + +pre > code.sourceCode { white-space: pre; position: relative; } +pre > code.sourceCode > span { display: inline-block; line-height: 1.25; } +pre > code.sourceCode > span:empty { height: 1.2em; } +.sourceCode { overflow: visible; } +code.sourceCode > span { color: inherit; text-decoration: inherit; } +div.sourceCode { margin: 1em 0; } +pre.sourceCode { margin: 0; } +@media screen { +div.sourceCode { overflow: auto; } +} +@media print { +pre > code.sourceCode { white-space: pre-wrap; } +pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; } +} +pre.numberSource code +{ counter-reset: source-line 0; } +pre.numberSource code > span +{ position: relative; left: -4em; counter-increment: source-line; } +pre.numberSource code > span > a:first-child::before +{ content: counter(source-line); +position: relative; left: -1em; text-align: right; vertical-align: baseline; +border: none; display: inline-block; +-webkit-touch-callout: none; -webkit-user-select: none; +-khtml-user-select: none; -moz-user-select: none; +-ms-user-select: none; user-select: none; +padding: 0 4px; width: 4em; +} +pre.numberSource { margin-left: 3em; padding-left: 4px; } +div.sourceCode +{ } +@media screen { +pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; } +} @@ -9169,7 +9203,7 @@

Table of contents

SmartZoning® Documentation

-

Leveraging Permitting and Zoning Data to Predict Upzoning Pressure, Philadelphia

+

Leveraging Permitting and Zoning Data to Predict Upzoning Pressure in Philadelphia

@@ -9205,7 +9239,51 @@

2 Overview

-

The following documentation details the development of a predictive model, which has demonstrated remarkable effectiveness in predicting future development patterns with a low mean absolute error. By accurately forecasting where growth is likely to occur using past permitting data against where current zoning may hinder growth, this model serves as a critical backbone to SmartZoning’s functionality. The study also considers the relationship between development pressure with race, income, and housing cost burden to strengthen the predictive model and investigate the impacts of development locally and city-wide.

+

Below, we outline how we developed a predictive model that effectively forecasts future annual development patterns with a low mean absolute error. This model is the basis of SmartZoning: by anticipating future development, it can highly where current zoning may hinder smart growth. We also consider the relationship between development pressure and race, income, and housing cost burden, demonstrating the generalizability of our model across different socioeconomic contexts and investigation the impacts of development locally and city-wide.

+
+
+Show the code +
required_packages <- c("tidyverse", "sf", "acs", "tidycensus", "sfdep", "kableExtra", "conflicted",
+                       "gganimate", "tmap", "gifski", "transformr", "ggpubr", "randomForest", "janitor",
+                       'igraph', "plotly", "ggcorrplot", "Kendall", "car", "shiny", "leaflet",
+                       "classInt")
+suppressWarnings(
+install_and_load_packages(required_packages)
+)
+
+source("utils/viz_utils.R")
+
+
+
+urls <- c(
+  roads = 'https://opendata.arcgis.com/datasets/261eeb49dfd44ccb8a4b6a0af830fdc8_0.geojson', # for broad and market
+  council_dists = "https://opendata.arcgis.com/datasets/9298c2f3fa3241fbb176ff1e84d33360_0.geojson",
+  building_permits = building_permits_path,
+  permits_bg = final_dataset_path,
+  zoning = "https://opendata.arcgis.com/datasets/0bdb0b5f13774c03abf8dc2f1aa01693_0.geojson",
+  opa = "data/opa_properties.geojson",
+  ols_preds = 'data/model_outputs/ols_preds.geojson',
+  rf_test_preds = 'data/model_outputs/rf_test_preds.geojson',
+  rf_val_preds = 'data/model_outputs/rf_val_preds.geojson',
+  rf_proj_preds = 'data/model_outputs/rf_proj_preds.geojson'
+  
+)
+
+suppressMessages({
+  invisible(
+    imap(urls, ~ assign(.y, phl_spat_read(.x), envir = .GlobalEnv))
+  )
+})
+
+broad_and_market <- roads %>% filter(ST_NAME %in% c('BROAD',"MARKET") | SEG_ID %in% c(440370, 421347,421338,421337,422413,423051,440403,440402,440391,440380))
+
+council_dists <- council_dists %>%
+                    select(DISTRICT)
+
+building_permits <- building_permits %>%
+                      filter(permittype %in% c("RESIDENTIAL BUILDING", "BP_ADDITION", "BP_NEWCNST"))
+
+

3 Select and Engineer Features

@@ -9249,6 +9327,22 @@

3.1 Permits

Firstly, 10 years of permit data from 2012 to 2023 from the Philadelphia Department of Licenses and Inspections are critical to the study. This study filters only for new construction permits granted for residential projects. In the future, filtering for full and substantial renovations could add more nuance to what constitutes as development pressure.

+
+
+Show the code +
tm <- tmap_theme(tm_shape(permits_bg %>% filter(!year %in% c(2012, 2024))) +
+        tm_polygons(col = "permits_count", border.alpha = 0, palette = mono_5_green, style = "fisher", colorNA = "lightgrey", title = "Permits") +
+  tm_facets(along = "year") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "darkgrey") +
+  tm_layout(frame = FALSE),
+  "New Construction Permits per Year\nPhiladelphia, PA")
+
+suppressMessages(
+tmap_animation(tm, "assets/permits_animation.gif", delay = 50)
+)
+
+

@@ -9258,9 +9352,38 @@

expiration of a tax abatement program for developers.

When assessing new construction permit count by Council Districts, a few districts issued the bulk of new permits during that 2021 peak. Hover over the lines to see more about the volume of permits and who granted them.

+
+Show the code +
perms_x_dist <- st_join(building_permits, council_dists)
+
+perms_x_dist_sum <- perms_x_dist %>%
+                  st_drop_geometry() %>%
+                  group_by(DISTRICT, year) %>%
+                  summarize(permits_count = n())
+
+perms_x_dist_mean = perms_x_dist_sum %>%
+                      group_by(year) %>%
+                      summarize(permits_count = mean(permits_count)) %>%
+                      mutate(DISTRICT = "Average")
+
+perms_x_dist_sum <- bind_rows(perms_x_dist_sum, perms_x_dist_mean) %>%
+                        mutate(color = ifelse(DISTRICT != "Average", 0, 1))
+
+ggplotly(
+ggplot(perms_x_dist_sum %>% filter(year > 2013, year < 2024), aes(x = year, y = permits_count, color = as.character(color), group = interaction(DISTRICT, color))) +
+  geom_line(lwd = 0.7) +
+  labs(title = "Permits per Year by Council District",
+       y = "Total Permits") +
+  # facet_wrap(~DISTRICT) +
+  theme_minimal() +
+  theme(axis.title.x = element_blank(),
+        legend.position = "none") +
+  scale_color_manual(values = c(palette[5], palette[1]))
+)
+
-
- +
+

@@ -9268,6 +9391,24 @@

3.2 Feature Engineering by Time and Space

New construction exhibits sizable spatial and temporal autocorrelation. In other words, there is a strong relationship between the number of permits in a given block group and the number of permits in neighboring block groups; as well as between the number of permits issued in a block group in a given year and the number of permits issued in that same block group in the previous year. To account for these relationships, we engineer new features, including both space and time lags. We note that all of these engineered features have strong correlation coefficients with our dependent variable, permits_count, and p-values indicating that these relationships are statistically significant.

+
+Show the code +
permits_bg_long <- permits_bg %>%
+                    filter(!year %in% c(2024)) %>%
+                    st_drop_geometry() %>%
+                    pivot_longer(
+                      cols = c(starts_with("lag"), dist_to_2022),
+                      names_to = "Variable",
+                      values_to = "Value"
+                    )
+
+
+ggscatter(permits_bg_long, x = "permits_count", y = "Value", facet.by = "Variable",
+   add = "reg.line",
+   add.params = list(color = palette[3], fill = palette[5]),
+   conf.int = TRUE
+   ) + stat_cor(method = "pearson", p.accuracy = 0.001, r.accuracy = 0.01)
+

@@ -9278,6 +9419,31 @@

Racial Mix (white vs non-white), median income, and housing cost burden are socioeconomic factors that often play an outsized role in affordability in cities like Philadelphia, with a pervasive and persistent history of housing discrimination and systemic disinvestment. This data is all pulled from the US Census Bureau’s American Community Survey 5-Year survey.

Spatially, is clear that non-white communities earn lower median incomes and experience higher rates of extreme rent burden (household spends more than 35% of income on gross rent).

+
+Show the code +
med_inc <- tmap_theme(tm_shape(permits_bg %>% filter(year == 2022)) +
+        tm_polygons(col = "med_inc", border.alpha = 0, palette = mono_5_orange, style = "fisher", colorNA = "lightgrey", title = "Med. Inc. ($)") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "darkgrey") +
+  tm_layout(frame = FALSE),
+  "Median Income")
+  
+race <- tmap_theme(tm_shape(permits_bg %>% filter(year == 2022)) +
+        tm_polygons(col = "percent_nonwhite", border.alpha = 0, palette = mono_5_orange, style = "fisher", colorNA = "lightgrey", title = "Nonwhite (%)") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "darkgrey") +
+  tm_layout(frame = FALSE),
+  "Race")
+  
+rent_burd <- tmap_theme(tm_shape(permits_bg %>% filter(year == 2022)) +
+        tm_polygons(col = "ext_rent_burden", border.alpha = 0, palette = mono_5_orange, style = "fisher", colorNA = "lightgrey", title = "Rent Burden (%)") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "darkgrey") +
+  tm_layout(frame = FALSE),
+  "Extreme Rent Burden")
+  
+tmap_arrange(med_inc, race, rent_burd)
+

@@ -9297,6 +9463,28 @@

4.1.1 Correlation Coefficients

+
+Show the code +
corr_vars <- c("total_pop",
+               "med_inc",
+               "percent_nonwhite",
+               "percent_renters",
+               "rent_burden",
+               "ext_rent_burden")
+
+corr_dat <- permits_bg %>% select(all_of(corr_vars), permits_count) %>% select(where(is.numeric)) %>% st_drop_geometry() %>% unique() %>% na.omit()
+
+corr <- round(cor(corr_dat), 2)
+p.mat <- cor_pmat(corr_dat)
+
+ggcorrplot(corr, p.mat = p.mat, hc.order = TRUE,
+    type = "full", insig = "blank", lab = TRUE, colors = c(palette[2], "white", palette[3])) +
+  annotate(
+  geom = "rect",
+  xmin = .5, xmax = 7.5, ymin = 4.5, ymax = 5.5,
+  fill = "transparent", color = "red", alpha = 0.5
+)
+

@@ -9308,6 +9496,16 @@

+
+Show the code +
ols <- lm(permits_count ~ ., data = permits_bg %>% filter(year < 2024) %>% select(-c(mapname, geoid10, year)) %>% st_drop_geometry())
+vif(ols) %>%
+  data.frame() %>%
+  clean_names() %>%
+  select(-df) %>%
+  arrange(desc(gvif)) %>%
+  kablerize()
+
@@ -9754,9 +9952,31 @@

+
+Show the code +
ggplot(building_permits %>% filter(!year %in% c(2024)), aes(x = as.factor(year))) +
+  geom_bar(fill = palette[1], color = NA, alpha = 0.7) +
+  labs(title = "Permits per Year",
+       y = "Count") +
+  theme_minimal() +
+  theme(axis.title.x = element_blank(),
+        aspect.ratio = .75)
+

+
+Show the code +
ggplot(permits_bg %>% st_drop_geometry %>% filter(!year %in% c(2024)), aes(x = permits_count)) +
+  geom_histogram(fill = palette[1], color = NA, alpha = 0.7) +
+  labs(title = "Permits per Block Group per Year",
+       subtitle = "Log-Transformed",
+       y = "Count") +
+  scale_x_log10() +
+  facet_wrap(~year) +
+  theme_minimal() +
+  theme(axis.title.x = element_blank())
+

@@ -9767,11 +9987,64 @@

4.2 Examine Spatial Patterns

To to identify spatial clusters, or hotspots, in geographic data, we performed a Local Moran’s I test. It assesses the degree of spatial autocorrelation, which is the extent to which the permit counts in a block group tend to be similar to neighboring block group. We used a p-value of 0.1 as our hotspot threshold.

+
+Show the code +
lisa <- permits_bg %>% 
+  filter(year == 2023) %>%
+  mutate(nb = st_contiguity(geometry),
+                         wt = st_weights(nb),
+                         permits_lag = st_lag(permits_count, nb, wt),
+          moran = local_moran(permits_count, nb, wt)) %>% 
+  tidyr::unnest(moran) %>% 
+  mutate(pysal = ifelse(p_folded_sim <= 0.1, as.character(pysal), NA),
+         hotspot = case_when(
+           pysal == "High-High" ~ "Yes",
+           TRUE ~ "No"
+         ))
+
+# 
+# palette <- c("High-High" = "#B20016", 
+#              "Low-Low" = "#1C4769", 
+#              "Low-High" = "#24975E", 
+#              "High-Low" = "#EACA97")
+
+morans_i <- tmap_theme(tm_shape(lisa) +
+  tm_polygons(col = "ii", border.alpha = 0, style = "jenks", palette = mono_5_green, title = "Moran's I"),
+  "Local Moran's I")
+
+p_value <- tmap_theme(tm_shape(lisa) +
+  tm_polygons(col = "p_ii", border.alpha = 0, style = "jenks", palette = mono_5_green, title = "P-Value"),
+  "Moran's I P-Value")
+
+sig_hotspots <- tmap_theme(tm_shape(lisa) +
+  tm_polygons(col = "hotspot", border.alpha = 0, style = "cat", palette = c(mono_5_green[1], mono_5_green[5]), textNA = "Not a Hotspot", title = "Hotspot?"),
+  "New Construction Hotspots")
+
+tmap_arrange(morans_i, p_value, sig_hotspots, ncol = 3)
+
-

+

Emergeging hotspots…? If I can get it to work.

+
+
+Show the code +
# stc <- as_spacetime(permits_bg %>% select(permits_count, geoid10, year) %>% na.omit(),
+#                  .loc_col = "geoid10",
+#                  .time_col = "year")
+# 
+# # conduct EHSA
+# ehsa <- emerging_hotspot_analysis(
+#   x = stc,
+#   .var = "permits_count",
+#   k = 1,
+#   nsim = 5
+# )
+# 
+# count(ehsa, classification)
+
+

4.3 Compare Models

@@ -9782,15 +10055,59 @@

4.3.1 OLS

OLS (Ordinary least squares) is a method to explore relationships between a dependent variable and one or more explanatory variables. It considers the strength and direction of these relationships and the goodness of model fit. Our model incorporates three engineered groups of features: space lag, time lag, and distance to 2022. We include this last variable because of the Philadelphia tax abatement policy that led to a significant increase in residential development in the years immediately before 2022 discussed earlier. We used this as a baseline model to compare to Poisson and Random Forest. Given how tightly aligned the observed and predicted prices are we performed dozens of variable combinations to rule out over fitting. We are confident that our variables are generalizable and do not over-fit.

+
+Show the code +
suppressMessages(
+ggplot(ols_preds, aes(x = permits_count, y = ols_preds)) +
+  geom_point() +
+  labs(title = "Predicted vs. Actual Permits: OLS",
+       subtitle = "2022 Data",
+       x = "Actual Permits",
+       y = "Predicted Permits") +
+  geom_abline() +
+  geom_smooth(method = "lm", se = FALSE, color = palette[3]) +
+  theme_minimal()
+)
+

+
+Show the code +
ggplot(ols_preds, aes(x = abs_error)) +
+  geom_histogram(fill = palette[3], color = NA, alpha = 0.7) +
+  labs(title = "Distribution of Absolute Error per Block Group",
+       subtitle = "OLS, 2022") +
+  theme_minimal()
+

+
+Show the code +
ols_mae <- paste0("MAE: ", round(mean(ols_preds$abs_error, na.rm = TRUE), 2))
+

Our OLS model exhibits a Mean Absolute Error (MAE) of 2.66, a decent performance for a model of its simplicity. However, its efficacy is notably diminished in critical domains where optimization is imperative. Consequently, we intend to enhance the predictive capacity by incorporating more pertinent variables and employing a more sophisticated modeling approach.

+
+Show the code +
ols_preds_map <- tmap_theme(tm_shape(ols_preds) +
+        tm_polygons(col = "ols_preds", border.alpha = 0, palette = mono_5_green, style = "fisher", colorNA = "lightgrey", title = "Permits") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Predicted Permits: OLS")
+
+ols_error_map <- tmap_theme(tm_shape(ols_preds) +
+        tm_polygons(col = "abs_error", border.alpha = 0, palette = mono_5_orange, style = "fisher", colorNA = "lightgrey", title = "Absolute Error") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Absolute Error: OLS")
+
+tmap_arrange(ols_preds_map, ols_error_map)
+

@@ -9802,15 +10119,59 @@

OLS and Random Forest represent different modeling paradigms. OLS is a linear regression model suitable for capturing linear relationships, while Random Forest is an ensemble method capable of capturing non-linear patterns and offering greater flexibility in handling various data scenarios. Considering, Random Forest is generally less sensitive to multicollinearity because it considers subsets of features in each tree and averages their predictions and because the effect of outliers tends to be mitigated, we decided it worth investigating Random Forest as an alternative model.

Compared to the OLS model, the relationship between predicted vs actual permits…

+
+Show the code +
ggplot(rf_test_preds, aes(x = abs_error)) +
+  geom_histogram(fill = palette[3], alpha = 0.7, color = NA) +
+  labs(title = "Distribution of Absolute Error per Block Group",
+       subtitle = "Random Forest, 2022") +
+  theme_minimal()
+

+
+Show the code +
suppressMessages(
+ggplot(rf_test_preds, aes(x = permits_count, y = rf_test_preds)) +
+  geom_point() +
+  labs(title = "Predicted vs. Actual Permits: RF",
+       subtitle = "2022 Data",
+       x = "Actual Permits",
+       y = "Predicted Permits") +
+  geom_abline() +
+  geom_smooth(method = "lm", se = FALSE, color = palette[3]) +
+  theme_minimal()
+)
+

+
+Show the code +
rf_test_mae <- paste0("MAE: ", round(mean(rf_test_preds$abs_error, na.rm = TRUE), 2))
+

Compared to the OLS Model, the Random Forest Model has a similar error distribution however, it exhibits a MAE of….

+
+Show the code +
test_preds_map <- tmap_theme(tm_shape(rf_test_preds) +
+        tm_polygons(col = "rf_test_preds", border.alpha = 0, palette = mono_5_green, style = "fisher", colorNA = "lightgrey", title = "Permits") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Predicted Permits: RF Test")
+
+test_error_map <- tmap_theme(tm_shape(rf_test_preds) +
+        tm_polygons(col = "abs_error", border.alpha = 0, palette = mono_5_orange, style = "fisher", colorNA = "lightgrey", title = "Absolute Error") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Absolute Error: RF Test") 
+
+tmap_arrange(test_preds_map, test_error_map)
+

@@ -9826,15 +10187,59 @@

We train and test up to 2022–we use this for model tuning and feature engineering.

Having settled on our model features and tuning, we now validate on 2023 data.

+
+Show the code +
val_preds_map <- tmap_theme(tm_shape(rf_val_preds) +
+        tm_polygons(col = "rf_val_preds", border.alpha = 0, palette = mono_5_green, style = "fisher", colorNA = "lightgrey", title = "Permits") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Predicted Permits: RF Validate")
+
+val_error_map <- tmap_theme(tm_shape(rf_val_preds) +
+        tm_polygons(col = "abs_error", border.alpha = 0, palette = mono_5_orange, style = "fisher", colorNA = "lightgrey", title = "Absolute Error") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Absolute Error: RF Validate")
+
+tmap_arrange(val_preds_map, val_error_map)
+

+
+Show the code +
ggplot(rf_val_preds, aes(x = abs_error)) +
+  geom_histogram(fill = palette[3], alpha = 0.7, color = NA) +
+  labs(title = "Distribution of Absolute Error per Block Group",
+       subtitle = "Random Forest, 2023") +
+  theme_minimal()
+

+
+Show the code +
suppressMessages(
+ggplot(rf_val_preds, aes(x = permits_count, y = rf_val_preds)) +
+  geom_point() +
+  labs(title = "Predicted vs. Actual Permits: RF",
+       subtitle = "2023 Data",
+       x = "Actual Permits",
+       y = "Predicted Permits") +
+  geom_abline() +
+  geom_smooth(method = "lm", se = FALSE, color = palette[3]) +
+  theme_minimal()
+)
+

+
+Show the code +
rf_val_mae <- paste0("MAE: ", round(mean(rf_val_preds$abs_error, na.rm = TRUE), 2))
+

We return an MAE of MAE: 2.19.

@@ -9844,6 +10249,16 @@

6.1 Accuracy

Predominately, our model overpredicts, which is better than underpredicting, as it facilitates new development.

+
+Show the code +
nbins <- as.integer(sqrt(nrow(rf_val_preds)))
+vline <- mean(rf_val_preds$abs_error, na.rm = TRUE)
+
+ggplot(rf_val_preds, aes(x = abs_error)) +
+  geom_histogram(fill = palette[3], alpha = 0.7, fill = NA, bins = nbins) +
+  geom_vline(aes(xintercept = vline)) +
+  theme_minimal()
+

@@ -9853,6 +10268,20 @@

6.2 Generalizabiltiy

The constructed boxplot, categorizing observations based on racial composition, indicates that the random forest model generalizes effectively, showcasing consistent and relatively low absolute errors across majority non-white and majority white categories. The discernible similarity in error distributions suggests that the model’s predictive performance remains robust across diverse racial compositions, affirming its ability to generalize successfully.

+
+Show the code +
rf_val_preds <- rf_val_preds %>%
+                      mutate(race_comp = case_when(
+                        percent_nonwhite >= .50 ~ "Majority Non-White",
+                        TRUE ~ "Majority White"
+                      ))
+
+ggplot(rf_val_preds, aes(y = abs_error, color = race_comp)) +
+  geom_boxplot(fill = NA) +
+  scale_color_manual(values = c(mono_5_orange[5], mono_5_orange[3])) +
+ # labs(title = "C")
+  theme_minimal()
+

@@ -9860,12 +10289,44 @@

We find that error is not related to affordability and actually trends downward with percent nonwhite. (This is probably because there is less total development happening there in majority-minority neighborhoods to begin with, so the magnitude of error is less, even though proportionally it might be more.) Error increases slightly with total pop. This makes sense–more people –> more development.

Our analysis reveals that the error is not correlated with affordability and demonstrates a downward trend in conjunction with the percentage of the nonwhite population. This observed pattern may be attributed to the likelihood that majority-minority neighborhoods experience a comparatively lower volume of overall development, thereby diminishing the absolute magnitude of error, despite potential proportional increases. Additionally, there is a slight increase in error with the total population, aligning with the intuitive expectation that higher population figures correspond to more extensive development activities.

+
+Show the code +
rf_val_preds_long <- rf_val_preds %>%
+  pivot_longer(cols = c(rent_burden, percent_nonwhite, total_pop, med_inc),
+               names_to = "variable", values_to = "value") %>%
+  mutate(variable = case_when(
+    variable == "med_inc" ~ "Median Income ($)",
+    variable == "percent_nonwhite" ~ "Nonwhite (%)",
+    variable == "rent_burden" ~ "Rent Burden (%)",
+    TRUE ~ "Total Pop."
+  ))
+
+ggplot(rf_val_preds_long, aes(x = value, y = abs_error)) +
+  geom_point() +
+  geom_smooth(method = "lm", se = FALSE, color = palette[3]) +
+  facet_wrap(~ variable, scales = "free_x") +
+  labs(title = "Generalizability of Absolute Error",
+       x = "Value",
+       y = "Absolute Error") +
+  theme_minimal()
+

How does this generalize across council districts?

+
+Show the code +
suppressMessages(
+  ggplot(rf_val_preds, aes(x = reorder(district, abs_error, FUN = mean), y = abs_error)) +
+    geom_boxplot(fill = NA, color = palette[3]) +
+    labs(title = "MAE by Council District",
+         y = "Mean Absolute Error",
+         x = "Council District") +
+    theme_minimal()
+)
+

@@ -9877,20 +10338,102 @@

+
+Show the code +
filtered_zoning <- zoning %>%
+                     filter(str_detect(CODE, "RS") | str_detect(CODE, "I"),
+                            CODE != "I2",
+                            !str_detect(CODE, "SP")) %>%
+                     st_join(., rf_val_preds %>% select(rf_val_preds))
+
+
+
+zoning_map <- tmap_theme(tm_shape(filtered_zoning) +
+        tm_polygons(col = "CODE", border.alpha = 0, colorNA = "lightgrey", title = "Zoning Code") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Mismatched Zoning")
+  
+mismatch <- tmap_theme(tm_shape(filtered_zoning) +
+        tm_polygons(col = "rf_val_preds", border.alpha = 0, colorNA = "lightgrey", palette = mono_5_orange, style = "fisher", title = "Predicted New Permits") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Development Pressure")
+
+tmap_arrange(zoning_map, mismatch)
+

We can extract development predictions at the block level to these parcels and then visualize them by highest need.

+
+Show the code +
tmap_mode('view')
+
+filtered_zoning %>%
+  filter(rf_val_preds > 10) %>%
+tm_shape() +
+        tm_polygons(col = "CODE", border.alpha = 0, colorNA = "lightgrey",
+                    popup.vars = c('rf_val_preds', 'CODE')) +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE)
+
-
- +
+

Furthermore, we can identify properties with high potential for assemblage, which suggests the ability to accomodate high-density, multi-unit housing.

+
+Show the code +
nbs <- filtered_zoning %>% 
+  mutate(nb = st_contiguity(geometry))
+
+# Create edge list while handling cases with no neighbors
+edge_list <- tibble::tibble(id = 1:length(nbs$nb), nbs = nbs$nb) %>% 
+  tidyr::unnest(nbs) %>% 
+  filter(nbs != 0)
+
+# Create a graph with a node for each row in filtered_zoning
+g <- make_empty_graph(n = nrow(filtered_zoning))
+V(g)$name <- as.character(1:nrow(filtered_zoning))
+
+# Add edges if they exist
+if (nrow(edge_list) > 0) {
+  edges <- as.matrix(edge_list)
+  g <- add_edges(g, c(t(edges)))
+}
+
+# Calculate the number of contiguous neighbors, handling nodes without neighbors
+n_contiguous <- sapply(V(g)$name, function(node) {
+  if (node %in% edges) {
+    length(neighborhood(g, order = 1, nodes = as.numeric(node))[[1]])
+  } else {
+    1  # Nodes without neighbors count as 1 (themselves)
+  }
+})
+
+filtered_zoning <- filtered_zoning %>%
+                    mutate(n_contig = n_contiguous)
+
+filtered_zoning %>%
+  st_drop_geometry() %>%
+  select(rf_val_preds,
+         n_contig,
+         OBJECTID,
+         CODE) %>%
+  filter(rf_val_preds > 10,
+         n_contig > 2) %>%
+  arrange(desc(rf_val_preds)) %>%
+  kablerize(caption = "Poorly-Zoned Properties with High Development Risk")
+

@@ -10223,6 +10766,17 @@

8 2024 Predictions

+
+Show the code +
tmap_mode('plot')
+
+tmap_theme(tm_shape(rf_proj_preds) +
+        tm_polygons(col = "rf_proj_preds", border.alpha = 0, palette = mono_5_green, style = "fisher", colorNA = "lightgrey", title = "Predicted New Permits") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Projected New Development, 2024")
+

@@ -10230,7 +10784,7 @@

9 Web Application

- +

10 Next Steps

diff --git a/index.html b/index.html index adbfc4c..84be0d7 100644 --- a/index.html +++ b/index.html @@ -20,6 +20,40 @@ width: 0.8em; margin: 0 0.8em 0.2em -1em; vertical-align: middle; } + +pre > code.sourceCode { white-space: pre; position: relative; } +pre > code.sourceCode > span { display: inline-block; line-height: 1.25; } +pre > code.sourceCode > span:empty { height: 1.2em; } +.sourceCode { overflow: visible; } +code.sourceCode > span { color: inherit; text-decoration: inherit; } +div.sourceCode { margin: 1em 0; } +pre.sourceCode { margin: 0; } +@media screen { +div.sourceCode { overflow: auto; } +} +@media print { +pre > code.sourceCode { white-space: pre-wrap; } +pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; } +} +pre.numberSource code +{ counter-reset: source-line 0; } +pre.numberSource code > span +{ position: relative; left: -4em; counter-increment: source-line; } +pre.numberSource code > span > a:first-child::before +{ content: counter(source-line); +position: relative; left: -1em; text-align: right; vertical-align: baseline; +border: none; display: inline-block; +-webkit-touch-callout: none; -webkit-user-select: none; +-khtml-user-select: none; -moz-user-select: none; +-ms-user-select: none; user-select: none; +padding: 0 4px; width: 4em; +} +pre.numberSource { margin-left: 3em; padding-left: 4px; } +div.sourceCode +{ } +@media screen { +pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; } +} @@ -9169,7 +9203,7 @@

Table of contents

SmartZoning® Documentation

-

Leveraging Permitting and Zoning Data to Predict Upzoning Pressure, Philadelphia

+

Leveraging Permitting and Zoning Data to Predict Upzoning Pressure in Philadelphia

@@ -9205,7 +9239,51 @@

2 Overview

-

The following documentation details the development of a predictive model, which has demonstrated remarkable effectiveness in predicting future development patterns with a low mean absolute error. By accurately forecasting where growth is likely to occur using past permitting data against where current zoning may hinder growth, this model serves as a critical backbone to SmartZoning’s functionality. The study also considers the relationship between development pressure with race, income, and housing cost burden to strengthen the predictive model and investigate the impacts of development locally and city-wide.

+

Below, we outline how we developed a predictive model that effectively forecasts future annual development patterns with a low mean absolute error. This model is the basis of SmartZoning: by anticipating future development, it can highly where current zoning may hinder smart growth. We also consider the relationship between development pressure and race, income, and housing cost burden, demonstrating the generalizability of our model across different socioeconomic contexts and investigation the impacts of development locally and city-wide.

+
+
+Show the code +
required_packages <- c("tidyverse", "sf", "acs", "tidycensus", "sfdep", "kableExtra", "conflicted",
+                       "gganimate", "tmap", "gifski", "transformr", "ggpubr", "randomForest", "janitor",
+                       'igraph', "plotly", "ggcorrplot", "Kendall", "car", "shiny", "leaflet",
+                       "classInt")
+suppressWarnings(
+install_and_load_packages(required_packages)
+)
+
+source("utils/viz_utils.R")
+
+
+
+urls <- c(
+  roads = 'https://opendata.arcgis.com/datasets/261eeb49dfd44ccb8a4b6a0af830fdc8_0.geojson', # for broad and market
+  council_dists = "https://opendata.arcgis.com/datasets/9298c2f3fa3241fbb176ff1e84d33360_0.geojson",
+  building_permits = building_permits_path,
+  permits_bg = final_dataset_path,
+  zoning = "https://opendata.arcgis.com/datasets/0bdb0b5f13774c03abf8dc2f1aa01693_0.geojson",
+  opa = "data/opa_properties.geojson",
+  ols_preds = 'data/model_outputs/ols_preds.geojson',
+  rf_test_preds = 'data/model_outputs/rf_test_preds.geojson',
+  rf_val_preds = 'data/model_outputs/rf_val_preds.geojson',
+  rf_proj_preds = 'data/model_outputs/rf_proj_preds.geojson'
+  
+)
+
+suppressMessages({
+  invisible(
+    imap(urls, ~ assign(.y, phl_spat_read(.x), envir = .GlobalEnv))
+  )
+})
+
+broad_and_market <- roads %>% filter(ST_NAME %in% c('BROAD',"MARKET") | SEG_ID %in% c(440370, 421347,421338,421337,422413,423051,440403,440402,440391,440380))
+
+council_dists <- council_dists %>%
+                    select(DISTRICT)
+
+building_permits <- building_permits %>%
+                      filter(permittype %in% c("RESIDENTIAL BUILDING", "BP_ADDITION", "BP_NEWCNST"))
+
+

3 Select and Engineer Features

@@ -9249,6 +9327,22 @@

3.1 Permits

Firstly, 10 years of permit data from 2012 to 2023 from the Philadelphia Department of Licenses and Inspections are critical to the study. This study filters only for new construction permits granted for residential projects. In the future, filtering for full and substantial renovations could add more nuance to what constitutes as development pressure.

+
+
+Show the code +
tm <- tmap_theme(tm_shape(permits_bg %>% filter(!year %in% c(2012, 2024))) +
+        tm_polygons(col = "permits_count", border.alpha = 0, palette = mono_5_green, style = "fisher", colorNA = "lightgrey", title = "Permits") +
+  tm_facets(along = "year") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "darkgrey") +
+  tm_layout(frame = FALSE),
+  "New Construction Permits per Year\nPhiladelphia, PA")
+
+suppressMessages(
+tmap_animation(tm, "assets/permits_animation.gif", delay = 50)
+)
+
+

@@ -9258,9 +9352,38 @@

expiration of a tax abatement program for developers.

When assessing new construction permit count by Council Districts, a few districts issued the bulk of new permits during that 2021 peak. Hover over the lines to see more about the volume of permits and who granted them.

+
+Show the code +
perms_x_dist <- st_join(building_permits, council_dists)
+
+perms_x_dist_sum <- perms_x_dist %>%
+                  st_drop_geometry() %>%
+                  group_by(DISTRICT, year) %>%
+                  summarize(permits_count = n())
+
+perms_x_dist_mean = perms_x_dist_sum %>%
+                      group_by(year) %>%
+                      summarize(permits_count = mean(permits_count)) %>%
+                      mutate(DISTRICT = "Average")
+
+perms_x_dist_sum <- bind_rows(perms_x_dist_sum, perms_x_dist_mean) %>%
+                        mutate(color = ifelse(DISTRICT != "Average", 0, 1))
+
+ggplotly(
+ggplot(perms_x_dist_sum %>% filter(year > 2013, year < 2024), aes(x = year, y = permits_count, color = as.character(color), group = interaction(DISTRICT, color))) +
+  geom_line(lwd = 0.7) +
+  labs(title = "Permits per Year by Council District",
+       y = "Total Permits") +
+  # facet_wrap(~DISTRICT) +
+  theme_minimal() +
+  theme(axis.title.x = element_blank(),
+        legend.position = "none") +
+  scale_color_manual(values = c(palette[5], palette[1]))
+)
+
-
- +
+

@@ -9268,6 +9391,24 @@

3.2 Feature Engineering by Time and Space

New construction exhibits sizable spatial and temporal autocorrelation. In other words, there is a strong relationship between the number of permits in a given block group and the number of permits in neighboring block groups; as well as between the number of permits issued in a block group in a given year and the number of permits issued in that same block group in the previous year. To account for these relationships, we engineer new features, including both space and time lags. We note that all of these engineered features have strong correlation coefficients with our dependent variable, permits_count, and p-values indicating that these relationships are statistically significant.

+
+Show the code +
permits_bg_long <- permits_bg %>%
+                    filter(!year %in% c(2024)) %>%
+                    st_drop_geometry() %>%
+                    pivot_longer(
+                      cols = c(starts_with("lag"), dist_to_2022),
+                      names_to = "Variable",
+                      values_to = "Value"
+                    )
+
+
+ggscatter(permits_bg_long, x = "permits_count", y = "Value", facet.by = "Variable",
+   add = "reg.line",
+   add.params = list(color = palette[3], fill = palette[5]),
+   conf.int = TRUE
+   ) + stat_cor(method = "pearson", p.accuracy = 0.001, r.accuracy = 0.01)
+

@@ -9278,6 +9419,31 @@

Racial Mix (white vs non-white), median income, and housing cost burden are socioeconomic factors that often play an outsized role in affordability in cities like Philadelphia, with a pervasive and persistent history of housing discrimination and systemic disinvestment. This data is all pulled from the US Census Bureau’s American Community Survey 5-Year survey.

Spatially, is clear that non-white communities earn lower median incomes and experience higher rates of extreme rent burden (household spends more than 35% of income on gross rent).

+
+Show the code +
med_inc <- tmap_theme(tm_shape(permits_bg %>% filter(year == 2022)) +
+        tm_polygons(col = "med_inc", border.alpha = 0, palette = mono_5_orange, style = "fisher", colorNA = "lightgrey", title = "Med. Inc. ($)") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "darkgrey") +
+  tm_layout(frame = FALSE),
+  "Median Income")
+  
+race <- tmap_theme(tm_shape(permits_bg %>% filter(year == 2022)) +
+        tm_polygons(col = "percent_nonwhite", border.alpha = 0, palette = mono_5_orange, style = "fisher", colorNA = "lightgrey", title = "Nonwhite (%)") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "darkgrey") +
+  tm_layout(frame = FALSE),
+  "Race")
+  
+rent_burd <- tmap_theme(tm_shape(permits_bg %>% filter(year == 2022)) +
+        tm_polygons(col = "ext_rent_burden", border.alpha = 0, palette = mono_5_orange, style = "fisher", colorNA = "lightgrey", title = "Rent Burden (%)") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "darkgrey") +
+  tm_layout(frame = FALSE),
+  "Extreme Rent Burden")
+  
+tmap_arrange(med_inc, race, rent_burd)
+

@@ -9297,6 +9463,28 @@

4.1.1 Correlation Coefficients

+
+Show the code +
corr_vars <- c("total_pop",
+               "med_inc",
+               "percent_nonwhite",
+               "percent_renters",
+               "rent_burden",
+               "ext_rent_burden")
+
+corr_dat <- permits_bg %>% select(all_of(corr_vars), permits_count) %>% select(where(is.numeric)) %>% st_drop_geometry() %>% unique() %>% na.omit()
+
+corr <- round(cor(corr_dat), 2)
+p.mat <- cor_pmat(corr_dat)
+
+ggcorrplot(corr, p.mat = p.mat, hc.order = TRUE,
+    type = "full", insig = "blank", lab = TRUE, colors = c(palette[2], "white", palette[3])) +
+  annotate(
+  geom = "rect",
+  xmin = .5, xmax = 7.5, ymin = 4.5, ymax = 5.5,
+  fill = "transparent", color = "red", alpha = 0.5
+)
+

@@ -9308,6 +9496,16 @@

+
+Show the code +
ols <- lm(permits_count ~ ., data = permits_bg %>% filter(year < 2024) %>% select(-c(mapname, geoid10, year)) %>% st_drop_geometry())
+vif(ols) %>%
+  data.frame() %>%
+  clean_names() %>%
+  select(-df) %>%
+  arrange(desc(gvif)) %>%
+  kablerize()
+

@@ -9754,9 +9952,31 @@

+
+Show the code +
ggplot(building_permits %>% filter(!year %in% c(2024)), aes(x = as.factor(year))) +
+  geom_bar(fill = palette[1], color = NA, alpha = 0.7) +
+  labs(title = "Permits per Year",
+       y = "Count") +
+  theme_minimal() +
+  theme(axis.title.x = element_blank(),
+        aspect.ratio = .75)
+

+
+Show the code +
ggplot(permits_bg %>% st_drop_geometry %>% filter(!year %in% c(2024)), aes(x = permits_count)) +
+  geom_histogram(fill = palette[1], color = NA, alpha = 0.7) +
+  labs(title = "Permits per Block Group per Year",
+       subtitle = "Log-Transformed",
+       y = "Count") +
+  scale_x_log10() +
+  facet_wrap(~year) +
+  theme_minimal() +
+  theme(axis.title.x = element_blank())
+

@@ -9767,11 +9987,64 @@

4.2 Examine Spatial Patterns

To to identify spatial clusters, or hotspots, in geographic data, we performed a Local Moran’s I test. It assesses the degree of spatial autocorrelation, which is the extent to which the permit counts in a block group tend to be similar to neighboring block group. We used a p-value of 0.1 as our hotspot threshold.

+
+Show the code +
lisa <- permits_bg %>% 
+  filter(year == 2023) %>%
+  mutate(nb = st_contiguity(geometry),
+                         wt = st_weights(nb),
+                         permits_lag = st_lag(permits_count, nb, wt),
+          moran = local_moran(permits_count, nb, wt)) %>% 
+  tidyr::unnest(moran) %>% 
+  mutate(pysal = ifelse(p_folded_sim <= 0.1, as.character(pysal), NA),
+         hotspot = case_when(
+           pysal == "High-High" ~ "Yes",
+           TRUE ~ "No"
+         ))
+
+# 
+# palette <- c("High-High" = "#B20016", 
+#              "Low-Low" = "#1C4769", 
+#              "Low-High" = "#24975E", 
+#              "High-Low" = "#EACA97")
+
+morans_i <- tmap_theme(tm_shape(lisa) +
+  tm_polygons(col = "ii", border.alpha = 0, style = "jenks", palette = mono_5_green, title = "Moran's I"),
+  "Local Moran's I")
+
+p_value <- tmap_theme(tm_shape(lisa) +
+  tm_polygons(col = "p_ii", border.alpha = 0, style = "jenks", palette = mono_5_green, title = "P-Value"),
+  "Moran's I P-Value")
+
+sig_hotspots <- tmap_theme(tm_shape(lisa) +
+  tm_polygons(col = "hotspot", border.alpha = 0, style = "cat", palette = c(mono_5_green[1], mono_5_green[5]), textNA = "Not a Hotspot", title = "Hotspot?"),
+  "New Construction Hotspots")
+
+tmap_arrange(morans_i, p_value, sig_hotspots, ncol = 3)
+
-

+

Emergeging hotspots…? If I can get it to work.

+
+
+Show the code +
# stc <- as_spacetime(permits_bg %>% select(permits_count, geoid10, year) %>% na.omit(),
+#                  .loc_col = "geoid10",
+#                  .time_col = "year")
+# 
+# # conduct EHSA
+# ehsa <- emerging_hotspot_analysis(
+#   x = stc,
+#   .var = "permits_count",
+#   k = 1,
+#   nsim = 5
+# )
+# 
+# count(ehsa, classification)
+
+

4.3 Compare Models

@@ -9782,15 +10055,59 @@

4.3.1 OLS

OLS (Ordinary least squares) is a method to explore relationships between a dependent variable and one or more explanatory variables. It considers the strength and direction of these relationships and the goodness of model fit. Our model incorporates three engineered groups of features: space lag, time lag, and distance to 2022. We include this last variable because of the Philadelphia tax abatement policy that led to a significant increase in residential development in the years immediately before 2022 discussed earlier. We used this as a baseline model to compare to Poisson and Random Forest. Given how tightly aligned the observed and predicted prices are we performed dozens of variable combinations to rule out over fitting. We are confident that our variables are generalizable and do not over-fit.

+
+Show the code +
suppressMessages(
+ggplot(ols_preds, aes(x = permits_count, y = ols_preds)) +
+  geom_point() +
+  labs(title = "Predicted vs. Actual Permits: OLS",
+       subtitle = "2022 Data",
+       x = "Actual Permits",
+       y = "Predicted Permits") +
+  geom_abline() +
+  geom_smooth(method = "lm", se = FALSE, color = palette[3]) +
+  theme_minimal()
+)
+

+
+Show the code +
ggplot(ols_preds, aes(x = abs_error)) +
+  geom_histogram(fill = palette[3], color = NA, alpha = 0.7) +
+  labs(title = "Distribution of Absolute Error per Block Group",
+       subtitle = "OLS, 2022") +
+  theme_minimal()
+

+
+Show the code +
ols_mae <- paste0("MAE: ", round(mean(ols_preds$abs_error, na.rm = TRUE), 2))
+

Our OLS model exhibits a Mean Absolute Error (MAE) of 2.66, a decent performance for a model of its simplicity. However, its efficacy is notably diminished in critical domains where optimization is imperative. Consequently, we intend to enhance the predictive capacity by incorporating more pertinent variables and employing a more sophisticated modeling approach.

+
+Show the code +
ols_preds_map <- tmap_theme(tm_shape(ols_preds) +
+        tm_polygons(col = "ols_preds", border.alpha = 0, palette = mono_5_green, style = "fisher", colorNA = "lightgrey", title = "Permits") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Predicted Permits: OLS")
+
+ols_error_map <- tmap_theme(tm_shape(ols_preds) +
+        tm_polygons(col = "abs_error", border.alpha = 0, palette = mono_5_orange, style = "fisher", colorNA = "lightgrey", title = "Absolute Error") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Absolute Error: OLS")
+
+tmap_arrange(ols_preds_map, ols_error_map)
+

@@ -9802,15 +10119,59 @@

OLS and Random Forest represent different modeling paradigms. OLS is a linear regression model suitable for capturing linear relationships, while Random Forest is an ensemble method capable of capturing non-linear patterns and offering greater flexibility in handling various data scenarios. Considering, Random Forest is generally less sensitive to multicollinearity because it considers subsets of features in each tree and averages their predictions and because the effect of outliers tends to be mitigated, we decided it worth investigating Random Forest as an alternative model.

Compared to the OLS model, the relationship between predicted vs actual permits…

+
+Show the code +
ggplot(rf_test_preds, aes(x = abs_error)) +
+  geom_histogram(fill = palette[3], alpha = 0.7, color = NA) +
+  labs(title = "Distribution of Absolute Error per Block Group",
+       subtitle = "Random Forest, 2022") +
+  theme_minimal()
+

+
+Show the code +
suppressMessages(
+ggplot(rf_test_preds, aes(x = permits_count, y = rf_test_preds)) +
+  geom_point() +
+  labs(title = "Predicted vs. Actual Permits: RF",
+       subtitle = "2022 Data",
+       x = "Actual Permits",
+       y = "Predicted Permits") +
+  geom_abline() +
+  geom_smooth(method = "lm", se = FALSE, color = palette[3]) +
+  theme_minimal()
+)
+

+
+Show the code +
rf_test_mae <- paste0("MAE: ", round(mean(rf_test_preds$abs_error, na.rm = TRUE), 2))
+

Compared to the OLS Model, the Random Forest Model has a similar error distribution however, it exhibits a MAE of….

+
+Show the code +
test_preds_map <- tmap_theme(tm_shape(rf_test_preds) +
+        tm_polygons(col = "rf_test_preds", border.alpha = 0, palette = mono_5_green, style = "fisher", colorNA = "lightgrey", title = "Permits") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Predicted Permits: RF Test")
+
+test_error_map <- tmap_theme(tm_shape(rf_test_preds) +
+        tm_polygons(col = "abs_error", border.alpha = 0, palette = mono_5_orange, style = "fisher", colorNA = "lightgrey", title = "Absolute Error") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Absolute Error: RF Test") 
+
+tmap_arrange(test_preds_map, test_error_map)
+

@@ -9826,15 +10187,59 @@

We train and test up to 2022–we use this for model tuning and feature engineering.

Having settled on our model features and tuning, we now validate on 2023 data.

+
+Show the code +
val_preds_map <- tmap_theme(tm_shape(rf_val_preds) +
+        tm_polygons(col = "rf_val_preds", border.alpha = 0, palette = mono_5_green, style = "fisher", colorNA = "lightgrey", title = "Permits") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Predicted Permits: RF Validate")
+
+val_error_map <- tmap_theme(tm_shape(rf_val_preds) +
+        tm_polygons(col = "abs_error", border.alpha = 0, palette = mono_5_orange, style = "fisher", colorNA = "lightgrey", title = "Absolute Error") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Absolute Error: RF Validate")
+
+tmap_arrange(val_preds_map, val_error_map)
+

+
+Show the code +
ggplot(rf_val_preds, aes(x = abs_error)) +
+  geom_histogram(fill = palette[3], alpha = 0.7, color = NA) +
+  labs(title = "Distribution of Absolute Error per Block Group",
+       subtitle = "Random Forest, 2023") +
+  theme_minimal()
+

+
+Show the code +
suppressMessages(
+ggplot(rf_val_preds, aes(x = permits_count, y = rf_val_preds)) +
+  geom_point() +
+  labs(title = "Predicted vs. Actual Permits: RF",
+       subtitle = "2023 Data",
+       x = "Actual Permits",
+       y = "Predicted Permits") +
+  geom_abline() +
+  geom_smooth(method = "lm", se = FALSE, color = palette[3]) +
+  theme_minimal()
+)
+

+
+Show the code +
rf_val_mae <- paste0("MAE: ", round(mean(rf_val_preds$abs_error, na.rm = TRUE), 2))
+

We return an MAE of MAE: 2.19.

@@ -9844,6 +10249,16 @@

6.1 Accuracy

Predominately, our model overpredicts, which is better than underpredicting, as it facilitates new development.

+
+Show the code +
nbins <- as.integer(sqrt(nrow(rf_val_preds)))
+vline <- mean(rf_val_preds$abs_error, na.rm = TRUE)
+
+ggplot(rf_val_preds, aes(x = abs_error)) +
+  geom_histogram(fill = palette[3], alpha = 0.7, fill = NA, bins = nbins) +
+  geom_vline(aes(xintercept = vline)) +
+  theme_minimal()
+

@@ -9853,6 +10268,20 @@

6.2 Generalizabiltiy

The constructed boxplot, categorizing observations based on racial composition, indicates that the random forest model generalizes effectively, showcasing consistent and relatively low absolute errors across majority non-white and majority white categories. The discernible similarity in error distributions suggests that the model’s predictive performance remains robust across diverse racial compositions, affirming its ability to generalize successfully.

+
+Show the code +
rf_val_preds <- rf_val_preds %>%
+                      mutate(race_comp = case_when(
+                        percent_nonwhite >= .50 ~ "Majority Non-White",
+                        TRUE ~ "Majority White"
+                      ))
+
+ggplot(rf_val_preds, aes(y = abs_error, color = race_comp)) +
+  geom_boxplot(fill = NA) +
+  scale_color_manual(values = c(mono_5_orange[5], mono_5_orange[3])) +
+ # labs(title = "C")
+  theme_minimal()
+

@@ -9860,12 +10289,44 @@

We find that error is not related to affordability and actually trends downward with percent nonwhite. (This is probably because there is less total development happening there in majority-minority neighborhoods to begin with, so the magnitude of error is less, even though proportionally it might be more.) Error increases slightly with total pop. This makes sense–more people –> more development.

Our analysis reveals that the error is not correlated with affordability and demonstrates a downward trend in conjunction with the percentage of the nonwhite population. This observed pattern may be attributed to the likelihood that majority-minority neighborhoods experience a comparatively lower volume of overall development, thereby diminishing the absolute magnitude of error, despite potential proportional increases. Additionally, there is a slight increase in error with the total population, aligning with the intuitive expectation that higher population figures correspond to more extensive development activities.

+
+Show the code +
rf_val_preds_long <- rf_val_preds %>%
+  pivot_longer(cols = c(rent_burden, percent_nonwhite, total_pop, med_inc),
+               names_to = "variable", values_to = "value") %>%
+  mutate(variable = case_when(
+    variable == "med_inc" ~ "Median Income ($)",
+    variable == "percent_nonwhite" ~ "Nonwhite (%)",
+    variable == "rent_burden" ~ "Rent Burden (%)",
+    TRUE ~ "Total Pop."
+  ))
+
+ggplot(rf_val_preds_long, aes(x = value, y = abs_error)) +
+  geom_point() +
+  geom_smooth(method = "lm", se = FALSE, color = palette[3]) +
+  facet_wrap(~ variable, scales = "free_x") +
+  labs(title = "Generalizability of Absolute Error",
+       x = "Value",
+       y = "Absolute Error") +
+  theme_minimal()
+

How does this generalize across council districts?

+
+Show the code +
suppressMessages(
+  ggplot(rf_val_preds, aes(x = reorder(district, abs_error, FUN = mean), y = abs_error)) +
+    geom_boxplot(fill = NA, color = palette[3]) +
+    labs(title = "MAE by Council District",
+         y = "Mean Absolute Error",
+         x = "Council District") +
+    theme_minimal()
+)
+

@@ -9877,20 +10338,102 @@

+
+Show the code +
filtered_zoning <- zoning %>%
+                     filter(str_detect(CODE, "RS") | str_detect(CODE, "I"),
+                            CODE != "I2",
+                            !str_detect(CODE, "SP")) %>%
+                     st_join(., rf_val_preds %>% select(rf_val_preds))
+
+
+
+zoning_map <- tmap_theme(tm_shape(filtered_zoning) +
+        tm_polygons(col = "CODE", border.alpha = 0, colorNA = "lightgrey", title = "Zoning Code") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Mismatched Zoning")
+  
+mismatch <- tmap_theme(tm_shape(filtered_zoning) +
+        tm_polygons(col = "rf_val_preds", border.alpha = 0, colorNA = "lightgrey", palette = mono_5_orange, style = "fisher", title = "Predicted New Permits") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Development Pressure")
+
+tmap_arrange(zoning_map, mismatch)
+

We can extract development predictions at the block level to these parcels and then visualize them by highest need.

+
+Show the code +
tmap_mode('view')
+
+filtered_zoning %>%
+  filter(rf_val_preds > 10) %>%
+tm_shape() +
+        tm_polygons(col = "CODE", border.alpha = 0, colorNA = "lightgrey",
+                    popup.vars = c('rf_val_preds', 'CODE')) +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE)
+
-
- +
+

Furthermore, we can identify properties with high potential for assemblage, which suggests the ability to accomodate high-density, multi-unit housing.

+
+Show the code +
nbs <- filtered_zoning %>% 
+  mutate(nb = st_contiguity(geometry))
+
+# Create edge list while handling cases with no neighbors
+edge_list <- tibble::tibble(id = 1:length(nbs$nb), nbs = nbs$nb) %>% 
+  tidyr::unnest(nbs) %>% 
+  filter(nbs != 0)
+
+# Create a graph with a node for each row in filtered_zoning
+g <- make_empty_graph(n = nrow(filtered_zoning))
+V(g)$name <- as.character(1:nrow(filtered_zoning))
+
+# Add edges if they exist
+if (nrow(edge_list) > 0) {
+  edges <- as.matrix(edge_list)
+  g <- add_edges(g, c(t(edges)))
+}
+
+# Calculate the number of contiguous neighbors, handling nodes without neighbors
+n_contiguous <- sapply(V(g)$name, function(node) {
+  if (node %in% edges) {
+    length(neighborhood(g, order = 1, nodes = as.numeric(node))[[1]])
+  } else {
+    1  # Nodes without neighbors count as 1 (themselves)
+  }
+})
+
+filtered_zoning <- filtered_zoning %>%
+                    mutate(n_contig = n_contiguous)
+
+filtered_zoning %>%
+  st_drop_geometry() %>%
+  select(rf_val_preds,
+         n_contig,
+         OBJECTID,
+         CODE) %>%
+  filter(rf_val_preds > 10,
+         n_contig > 2) %>%
+  arrange(desc(rf_val_preds)) %>%
+  kablerize(caption = "Poorly-Zoned Properties with High Development Risk")
+

@@ -10223,6 +10766,17 @@

8 2024 Predictions

+
+Show the code +
tmap_mode('plot')
+
+tmap_theme(tm_shape(rf_proj_preds) +
+        tm_polygons(col = "rf_proj_preds", border.alpha = 0, palette = mono_5_green, style = "fisher", colorNA = "lightgrey", title = "Predicted New Permits") +
+  tm_shape(broad_and_market) +
+  tm_lines(col = "lightgrey") +
+  tm_layout(frame = FALSE),
+  "Projected New Development, 2024")
+

@@ -10230,7 +10784,7 @@

9 Web Application

- +

10 Next Steps