diff --git a/DataWork/descriptive-statistics.R b/DataWork/descriptive-statistics.R
new file mode 100644
index 0000000..eb80402
--- /dev/null
+++ b/DataWork/descriptive-statistics.R
@@ -0,0 +1,101 @@
+# Load packages ################################################################
+
+library(here)
+library(tidyverse)
+library(skimr)
+library(lfe)
+library(huxtable)
+library(openxlsx)
+
+# Load data
+census <-
+  read_rds(here("DataWork",
+                "DataSets",
+                "Final",
+                "census.RDS"))
+
+# Exploring the data ###########################################################
+
+# See summary statistics for the whole data set:
+summary(census)
+summary(census, 0)
+
+# See summary statistics for a single variable:
+summary(census$region)
+summary(census$death)
+
+# Frequency table using table()
+table(census$region)
+table(census$state, census$region)
+
+# Now with skimr
+skim(census)
+
+# Creating custom summary statistics tables ####################################
+
+summary_stats <-
+  skim_with(numeric = sfl(Mean = mean,     # Variable name = statistic
+                          Median = median,
+                          SD = sd,
+                          Min = min,
+                          Max = max),
+            append = FALSE)                # Remove all default statistics
+
+summary_stats_table <-
+  census %>%
+  summary_stats() %>%
+  yank("numeric") %>%
+  select(- complete_rate) %>%
+  rename(`Missings` = n_missing)
+
+# Exporting tables #############################################################
+
+quick_xlsx(summary_stats_table,
+           file = here("DataWork",
+                       "Output",
+                       "Raw",
+                       "summary-statistics.xlsx"))
+
+## Add variable names --------------------------------------------
+# Extract variable labels from data frame
+# (labels are read from the `var.labels` attribute, one per column of `census`)
+census_dictionary <-
+  data.frame("Variable" = attributes(census)$var.labels,
+             "name" = names(census))
+
+summary_stats_table <-
+  summary_stats_table %>%
+  rename(name = skim_variable) %>%                # Rename var with var names so we can merge the datasets
+  left_join(census_dictionary, by = "name") %>%   # Merge to variable labels on the explicit key
+  select(-name) %>%                               # Keep only variable labels instead of names
+  as_hux                                          # Convert it into a huxtable object
+
+summary_stats_table <-
+  summary_stats_table %>%
+  relocate(Variable) %>%                                    # Make variable labels the first column
+  set_header_rows(1, TRUE) %>%                              # Use stats name as table header
+  set_header_cols("Variable", TRUE) %>%                     # Use variable name as row header
+  set_number_format(everywhere, 2:ncol(.), "%9.0f") %>%     # Don't round large numbers
+  theme_basic()                                             # Set a theme for quick formatting
+
+quick_xlsx(summary_stats_table,
+           file = here("DataWork",
+                       "Output",
+                       "Raw",
+                       "summary-statistics-formatted.xlsx"))
+
+# Aggregate tables ###########################################################
+
+census_region <-
+  census %>%
+  group_by(region) %>%
+  summarise(`Number of States` = n_distinct(state),
+            `Total Population` = sum(pop))
+
+
+# Run regression ######################################################
+
+reg1 <-
+  lm(divorce ~ pop + popurban + marriage,
+     census)
+
diff --git a/Presentations/05-descriptive-analysis.Rmd b/Presentations/05-descriptive-analysis.Rmd
index 07e44e3..9043abf 100644
--- a/Presentations/05-descriptive-analysis.Rmd
+++ b/Presentations/05-descriptive-analysis.Rmd
@@ -129,6 +129,30 @@ But that's enough of me talking. 
Let's get you all to run some code # Setting the stage + +Load the packages that we will use today +```{r, eval = FALSE} + # Install new packages + install.packages("skimr") + install.packages("lfe") + install.packages("huxtable") + install.packages("openxlsx") +``` + +```{r, warning = FALSE} + # Load packages + library(here) + library(tidyverse) + library(skimr) + library(lfe) + library(huxtable) + library(openxlsx) +``` + +--- + +# Setting the stage + Load the data that we will use today: Stata's `census` dataset ```{r} # Load data @@ -161,27 +185,6 @@ glimpse(census) --- -# Setting the stage - - -Load the packages that we will use today -```{r, eval = F} - # Install new packages - install.packages("skimr") - install.packages("lfe") - install.packages("huxtable") -``` - -```{r, warning = FALSE} - # Load packages - library(tidyverse) - library(skimr) - library(lfe) - library(huxtable) -``` - ---- - class: inverse, center, middle name: exploring @@ -214,8 +217,8 @@ Use the `summary()` function to describe the `census` data frame. # Exploring a dataset -```{r, echo = FALSE} -include_app("https://luizaandrade.shinyapps.io/learnr/") +```{r} +summary(census) ``` --- @@ -238,11 +241,6 @@ include_app("https://luizaandrade.shinyapps.io/learnr/") Use the `summary()` function to display summary statistics for a continuous variable in the `census` data frame. 
] -```{r, echo = FALSE} -include_app("https://luizaandrade.shinyapps.io/learnr/") -``` - - --- # Summarizing categorical variables @@ -268,8 +266,10 @@ Use the `table()` function to display frequency tables for: ## One way tabulation -```{r, echo = FALSE} -include_app("https://luizaandrade.shinyapps.io/learnr/") +-- + +```{r} +table(census$region) ``` @@ -279,8 +279,10 @@ include_app("https://luizaandrade.shinyapps.io/learnr/") ## Two way tabulation +-- + ```{r, echo = FALSE} -include_app("https://luizaandrade.shinyapps.io/learnr/") +table(census$state, census$region) ``` @@ -402,12 +404,10 @@ census %>% ``` Here are a few functions that can be used within `sfl()`: -- Center: `mean()`, `median()` -- Spread: `sd()`, `IQR()`, `mad()` -- Range: `min()`, `max()`, `quantile()` -- Position: `first()`, `last()`, `nth()`, -- Count: `n()`, `n_distinct()` -- Logical: `any()`, `all()` +- Center: `mean`, `median` +- Spread: `sd`, `IQR`, `var` +- Range: `min`, `max`, `quantile` +- Count: `n_complete`, `n_missing`, `length` --- @@ -574,7 +574,7 @@ summary_stats_table <- #<< relocate(Variable) %>% # Make variable labels the first column #<< set_header_rows(1, TRUE) %>% # Use stats name as table header #<< set_header_cols("Variable", TRUE) %>% # Use variable name as row header #<< - set_number_format("\"%9.0f\"" ) %>% # Don't round large numbers #<< + set_number_format(everywhere, 2:ncol(.), "%9.0f") %>% # Don't round large numbers #<< theme_basic() # Set a theme for quick formatting #<< ``` @@ -591,7 +591,7 @@ summary_stats_table <- #<< # Beautifying tables .small[ -```{r, message = FALSE, eval = F} +```{r, message = FALSE} # Extract variable labels from data frame census_dictionary <- @@ -611,12 +611,15 @@ summary_stats_table <- relocate(Variable) %>% # Make variable labels the first column set_header_rows(1, TRUE) %>% # Use stats name as table header set_header_cols("Variable", TRUE) %>% # Use variable name as row header - set_number_format("\"%9.0f\"" ) %>% # Don't round 
large numbers + set_number_format(everywhere, 2:ncol(.), "%9.0f") %>% # Don't round large numbers theme_basic() # Set a theme for quick formatting # Now export it #<< quick_xlsx(summary_stats_table, #<< - file = file.path(rawOutput, "summary-stats-basic.xlsx")) #<< + file = here("DataWork", #<< + "Output", #<< + "Raw", #<< + "summary-stats-basic.xlsx")) #<< quick_latex(summary_stats_table, #<< file = here("DataWork", #<< @@ -706,11 +709,6 @@ The "name-value" pairs mentioned under `...` look like this: `new_variable = sta Recreate the `region_stats` data set, now including the average and the standard deviation of the population. ] - -```{r, echo = FALSE} -include_app("https://luizaandrade.shinyapps.io/learnr/") -``` - --- # Aggregating observations @@ -820,8 +818,10 @@ Using the `census` data, run a regression of the number of divorces on populatio ] -```{r, echo = FALSE} -include_app("https://luizaandrade.shinyapps.io/learnr/") +```{r} +reg1 <- + lm(divorce ~ pop + popurban + marriage, + census) ``` @@ -897,8 +897,12 @@ Using the `census` data, run a regression of the number of divorces on populatio Using the `census` data, run a regression of divorce on population, urban population and number of marriages controlling for region fixed effects. ] -```{r, echo = FALSE} -include_app("https://luizaandrade.shinyapps.io/learnr/") +```{r, eval = FALSE} +reg2 <- + felm(divorce ~ pop + popurban + marriage | region | 0 | 0, + census) + +summary(reg2) ``` --- diff --git a/Presentations/05-descriptive-analysis.html b/Presentations/05-descriptive-analysis.html index 46ae738..5a30d5c 100644 --- a/Presentations/05-descriptive-analysis.html +++ b/Presentations/05-descriptive-analysis.html @@ -20,6 +20,7 @@ +